From bb3ca38ef7aa56dcfa7f6e81675c7a39d5ee9bf1 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Fri, 3 May 2024 08:43:11 -0700 Subject: [PATCH 001/272] hv_balloon: Use kernel macros to simplify open coded sequences Code sequences equivalent to ALIGN(), ALIGN_DOWN(), and umin() are currently open coded. Change these to use the kernel macro to improve code clarity. ALIGN() and ALIGN_DOWN() require the alignment value to be a power of 2, which is the case here. Reviewed-by: David Hildenbrand Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20240503154312.142466-1-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240503154312.142466-1-mhklinux@outlook.com> --- drivers/hv/hv_balloon.c | 40 ++++++++-------------------------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index e000fa3b9f978..9f45b8a6762c8 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -729,15 +729,8 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { has->ha_end_pfn += HA_CHUNK; - - if (total_pfn > HA_CHUNK) { - processed_pfn = HA_CHUNK; - total_pfn -= HA_CHUNK; - } else { - processed_pfn = total_pfn; - total_pfn = 0; - } - + processed_pfn = umin(total_pfn, HA_CHUNK); + total_pfn -= processed_pfn; has->covered_end_pfn += processed_pfn; } @@ -800,7 +793,7 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) { struct hv_hotadd_state *has; struct hv_hotadd_gap *gap; - unsigned long residual, new_inc; + unsigned long residual; int ret = 0; guard(spinlock_irqsave)(&dm_device.ha_lock); @@ -836,15 +829,9 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) * our current limit; extend it. */ if ((start_pfn + pfn_cnt) > has->end_pfn) { + /* Extend the region by multiples of HA_CHUNK */ residual = (start_pfn + pfn_cnt - has->end_pfn); - /* - * Extend the region by multiples of HA_CHUNK. 
- */ - new_inc = (residual / HA_CHUNK) * HA_CHUNK; - if (residual % HA_CHUNK) - new_inc += HA_CHUNK; - - has->end_pfn += new_inc; + has->end_pfn += ALIGN(residual, HA_CHUNK); } ret = 1; @@ -915,9 +902,7 @@ static unsigned long handle_pg_range(unsigned long pg_start, */ size = (has->end_pfn - has->ha_end_pfn); if (pfn_cnt <= size) { - size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK); - if (pfn_cnt % HA_CHUNK) - size += HA_CHUNK; + size = ALIGN(pfn_cnt, HA_CHUNK); } else { pfn_cnt = size; } @@ -1011,9 +996,6 @@ static void hot_add_req(struct work_struct *dummy) rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; if ((rg_start == 0) && (!dm->host_specified_ha_region)) { - unsigned long region_size; - unsigned long region_start; - /* * The host has not specified the hot-add region. * Based on the hot-add page range being specified, @@ -1021,14 +1003,8 @@ static void hot_add_req(struct work_struct *dummy) * that need to be hot-added while ensuring the alignment * and size requirements of Linux as it relates to hot-add. */ - region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK; - if (pfn_cnt % HA_CHUNK) - region_size += HA_CHUNK; - - region_start = (pg_start / HA_CHUNK) * HA_CHUNK; - - rg_start = region_start; - rg_sz = region_size; + rg_start = ALIGN_DOWN(pg_start, HA_CHUNK); + rg_sz = ALIGN(pfn_cnt, HA_CHUNK); } if (do_hot_add) From 8852ebf1948d94ecaf4d1113032dda7e58e72b84 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Fri, 3 May 2024 08:43:12 -0700 Subject: [PATCH 002/272] hv_balloon: Enable hot-add for memblock sizes > 128 MiB The Hyper-V balloon driver supports hot-add of memory in addition to ballooning. Current code hot-adds in fixed size chunks of 128 MiB (fixed constant HA_CHUNK in the code). While this works in Hyper-V VMs with 64 GiB or less of memory where the Linux memblock size is 128 MiB, the hot-add fails for larger memblock sizes because add_memory() expects memory to be added in chunks that match the memblock size. 
Messages like the following are reported when Linux has a 256 MiB memblock size: [ 312.668859] Block size [0x10000000] unaligned hotplug range: start 0x310000000, size 0x8000000 [ 312.668880] hv_balloon: hot_add memory failed error is -22 [ 312.668984] hv_balloon: Memory hot add failed Larger memblock sizes are usually used in VMs with more than 64 GiB of memory, depending on the alignment of the VM's physical address space. Fix this problem by having the Hyper-V balloon driver determine the Linux memblock size, and process hot-add requests in that chunk size instead of a fixed 128 MiB. Also update the hot-add alignment requested of the Hyper-V host to match the memblock size. The code changes look significant, but in fact are just a simple text substitution of a new global variable for the previous HA_CHUNK constant. No algorithms are changed except to initialize the new global variable and to calculate the alignment value to pass to Hyper-V. Testing with memblock sizes of 256 MiB and 2 GiB shows correct operation. Reviewed-by: David Hildenbrand Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20240503154312.142466-2-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240503154312.142466-2-mhklinux@outlook.com> --- drivers/hv/hv_balloon.c | 64 +++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 9f45b8a6762c8..4370ad31b5b38 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -425,11 +426,11 @@ struct dm_info_msg { * The range start_pfn : end_pfn specifies the range * that the host has asked us to hot add. The range * start_pfn : ha_end_pfn specifies the range that we have - * currently hot added. We hot add in multiples of 128M - * chunks; it is possible that we may not be able to bring - * online all the pages in the region. 
The range + * currently hot added. We hot add in chunks equal to the + * memory block size; it is possible that we may not be able + * to bring online all the pages in the region. The range * covered_start_pfn:covered_end_pfn defines the pages that can - * be brough online. + * be brought online. */ struct hv_hotadd_state { @@ -505,8 +506,11 @@ enum hv_dm_state { static __u8 recv_buffer[HV_HYP_PAGE_SIZE]; static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE]; + +static unsigned long ha_pages_in_chunk; +#define HA_BYTES_IN_CHUNK (ha_pages_in_chunk << PAGE_SHIFT) + #define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE) -#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE) struct hv_dynmem_device { struct hv_device *dev; @@ -724,21 +728,21 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, unsigned long processed_pfn; unsigned long total_pfn = pfn_count; - for (i = 0; i < (size/HA_CHUNK); i++) { - start_pfn = start + (i * HA_CHUNK); + for (i = 0; i < (size/ha_pages_in_chunk); i++) { + start_pfn = start + (i * ha_pages_in_chunk); scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { - has->ha_end_pfn += HA_CHUNK; - processed_pfn = umin(total_pfn, HA_CHUNK); + has->ha_end_pfn += ha_pages_in_chunk; + processed_pfn = umin(total_pfn, ha_pages_in_chunk); total_pfn -= processed_pfn; - has->covered_end_pfn += processed_pfn; + has->covered_end_pfn += processed_pfn; } reinit_completion(&dm_device.ol_waitevent); nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), - (HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE); + HA_BYTES_IN_CHUNK, MHP_MERGE_RESOURCE); if (ret) { pr_err("hot_add memory failed error is %d\n", ret); @@ -753,7 +757,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, do_hot_add = false; } scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { - has->ha_end_pfn -= HA_CHUNK; + has->ha_end_pfn -= ha_pages_in_chunk; has->covered_end_pfn -= processed_pfn; } break; @@ -829,9 +833,9 @@ static int 
pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) * our current limit; extend it. */ if ((start_pfn + pfn_cnt) > has->end_pfn) { - /* Extend the region by multiples of HA_CHUNK */ + /* Extend the region by multiples of ha_pages_in_chunk */ residual = (start_pfn + pfn_cnt - has->end_pfn); - has->end_pfn += ALIGN(residual, HA_CHUNK); + has->end_pfn += ALIGN(residual, ha_pages_in_chunk); } ret = 1; @@ -897,12 +901,12 @@ static unsigned long handle_pg_range(unsigned long pg_start, * We have some residual hot add range * that needs to be hot added; hot add * it now. Hot add a multiple of - * HA_CHUNK that fully covers the pages + * ha_pages_in_chunk that fully covers the pages * we have. */ size = (has->end_pfn - has->ha_end_pfn); if (pfn_cnt <= size) { - size = ALIGN(pfn_cnt, HA_CHUNK); + size = ALIGN(pfn_cnt, ha_pages_in_chunk); } else { pfn_cnt = size; } @@ -1003,8 +1007,8 @@ static void hot_add_req(struct work_struct *dummy) * that need to be hot-added while ensuring the alignment * and size requirements of Linux as it relates to hot-add. */ - rg_start = ALIGN_DOWN(pg_start, HA_CHUNK); - rg_sz = ALIGN(pfn_cnt, HA_CHUNK); + rg_start = ALIGN_DOWN(pg_start, ha_pages_in_chunk); + rg_sz = ALIGN(pfn_cnt, ha_pages_in_chunk); } if (do_hot_add) @@ -1807,10 +1811,13 @@ static int balloon_connect_vsp(struct hv_device *dev) cap_msg.caps.cap_bits.hot_add = hot_add_enabled(); /* - * Specify our alignment requirements as it relates - * memory hot-add. Specify 128MB alignment. + * Specify our alignment requirements for memory hot-add. The value is + * the log base 2 of the number of megabytes in a chunk. For example, + * with 256 MiB chunks, the value is 8. The number of MiB in a chunk + * must be a power of 2. 
*/ - cap_msg.caps.cap_bits.hot_add_alignment = 7; + cap_msg.caps.cap_bits.hot_add_alignment = + ilog2(HA_BYTES_IN_CHUNK / SZ_1M); /* * Currently the host does not use these @@ -1960,8 +1967,23 @@ static int balloon_probe(struct hv_device *dev, hot_add = false; #ifdef CONFIG_MEMORY_HOTPLUG + /* + * Hot-add must operate in chunks that are of size equal to the + * memory block size because that's what the core add_memory() + * interface requires. The Hyper-V interface requires that the memory + * block size be a power of 2, which is guaranteed by the check in + * memory_dev_init(). + */ + ha_pages_in_chunk = memory_block_size_bytes() / PAGE_SIZE; do_hot_add = hot_add; #else + /* + * Without MEMORY_HOTPLUG, the guest returns a failure status for all + * hot add requests from Hyper-V, and the chunk size is used only to + * specify alignment to Hyper-V as required by the host/guest protocol. + * Somewhat arbitrarily, use 128 MiB. + */ + ha_pages_in_chunk = SZ_128M / PAGE_SIZE; do_hot_add = false; #endif dm_device.dev = dev; From 207e03b00b47ccbd692941b183510026e1bd6ce9 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Sun, 5 May 2024 22:38:58 -0700 Subject: [PATCH 003/272] tools: hv: suppress the invalid warning for packed member alignment Packed struct vmbus_bufring is 4096 byte aligned and the reporting warning is for the first member of that struct which shouldn't add any offset to create alignment issue. Suppress the warning by adding -Wno-address-of-packed-member flag to gcc. 
Fixes: 45bab4d74651 ("tools: hv: Add vmbus_bufring") Reported-by: kernel test robot Closes: https://lore.kernel.org/all/202404121913.GhtSoKbW-lkp@intel.com/ Signed-off-by: Saurabh Sengar Link: https://lore.kernel.org/r/1714973938-4063-1-git-send-email-ssengar@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1714973938-4063-1-git-send-email-ssengar@linux.microsoft.com> --- tools/hv/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/hv/Makefile b/tools/hv/Makefile index bb52871da3412..2e60e2c212cd9 100644 --- a/tools/hv/Makefile +++ b/tools/hv/Makefile @@ -17,6 +17,7 @@ endif MAKEFLAGS += -r override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include +override CFLAGS += -Wno-address-of-packed-member ALL_TARGETS := hv_kvp_daemon hv_vss_daemon ifneq ($(ARCH), aarch64) From 4c5a65fd10895708952106652b2ac2ca3b7bb9d9 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sat, 11 May 2024 06:38:17 -0700 Subject: [PATCH 004/272] Documentation: hyperv: Update spelling and fix typo Update spelling from "VMbus" to "VMBus" to match Hyper-V product documentation. Also correct typo: "SNP-SEV" should be "SEV-SNP". Signed-off-by: Michael Kelley Reviewed-by: Easwar Hariharan Reviewed-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240511133818.19649-1-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240511133818.19649-1-mhklinux@outlook.com> --- Documentation/virt/hyperv/overview.rst | 22 +++---- Documentation/virt/hyperv/vmbus.rst | 82 +++++++++++++------------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/Documentation/virt/hyperv/overview.rst b/Documentation/virt/hyperv/overview.rst index cd493332c88a6..77408a89d1a40 100644 --- a/Documentation/virt/hyperv/overview.rst +++ b/Documentation/virt/hyperv/overview.rst @@ -40,7 +40,7 @@ Linux guests communicate with Hyper-V in four different ways: arm64, these synthetic registers must be accessed using explicit hypercalls. 
-* VMbus: VMbus is a higher-level software construct that is built on +* VMBus: VMBus is a higher-level software construct that is built on the other 3 mechanisms. It is a message passing interface between the Hyper-V host and the Linux guest. It uses memory that is shared between Hyper-V and the guest, along with various signaling @@ -54,8 +54,8 @@ x86/x64 architecture only. .. _Hyper-V Top Level Functional Spec (TLFS): https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/tlfs -VMbus is not documented. This documentation provides a high-level -overview of VMbus and how it works, but the details can be discerned +VMBus is not documented. This documentation provides a high-level +overview of VMBus and how it works, but the details can be discerned only from the code. Sharing Memory @@ -74,7 +74,7 @@ follows: physical address space. How Hyper-V is told about the GPA or list of GPAs varies. In some cases, a single GPA is written to a synthetic register. In other cases, a GPA or list of GPAs is sent - in a VMbus message. + in a VMBus message. * Hyper-V translates the GPAs into "real" physical memory addresses, and creates a virtual mapping that it can use to access the memory. @@ -133,9 +133,9 @@ only the CPUs actually present in the VM, so Linux does not report any hot-add CPUs. A Linux guest CPU may be taken offline using the normal Linux -mechanisms, provided no VMbus channel interrupts are assigned to -the CPU. See the section on VMbus Interrupts for more details -on how VMbus channel interrupts can be re-assigned to permit +mechanisms, provided no VMBus channel interrupts are assigned to +the CPU. See the section on VMBus Interrupts for more details +on how VMBus channel interrupts can be re-assigned to permit taking a CPU offline. 32-bit and 64-bit @@ -169,14 +169,14 @@ and functionality. Hyper-V indicates feature/function availability via flags in synthetic MSRs that Hyper-V provides to the guest, and the guest code tests these flags. 
-VMbus has its own protocol version that is negotiated during the -initial VMbus connection from the guest to Hyper-V. This version +VMBus has its own protocol version that is negotiated during the +initial VMBus connection from the guest to Hyper-V. This version number is also output to dmesg during boot. This version number is checked in a few places in the code to determine if specific functionality is present. -Furthermore, each synthetic device on VMbus also has a protocol -version that is separate from the VMbus protocol version. Device +Furthermore, each synthetic device on VMBus also has a protocol +version that is separate from the VMBus protocol version. Device drivers for these synthetic devices typically negotiate the device protocol version, and may test that protocol version to determine if specific device functionality is present. diff --git a/Documentation/virt/hyperv/vmbus.rst b/Documentation/virt/hyperv/vmbus.rst index d2012d9022c5e..f0d83ebda626a 100644 --- a/Documentation/virt/hyperv/vmbus.rst +++ b/Documentation/virt/hyperv/vmbus.rst @@ -1,8 +1,8 @@ .. SPDX-License-Identifier: GPL-2.0 -VMbus +VMBus ===== -VMbus is a software construct provided by Hyper-V to guest VMs. It +VMBus is a software construct provided by Hyper-V to guest VMs. It consists of a control path and common facilities used by synthetic devices that Hyper-V presents to guest VMs. The control path is used to offer synthetic devices to the guest VM and, in some cases, @@ -12,9 +12,9 @@ and the synthetic device implementation that is part of Hyper-V, and signaling primitives to allow Hyper-V and the guest to interrupt each other. -VMbus is modeled in Linux as a bus, with the expected /sys/bus/vmbus -entry in a running Linux guest. The VMbus driver (drivers/hv/vmbus_drv.c) -establishes the VMbus control path with the Hyper-V host, then +VMBus is modeled in Linux as a bus, with the expected /sys/bus/vmbus +entry in a running Linux guest. 
The VMBus driver (drivers/hv/vmbus_drv.c) +establishes the VMBus control path with the Hyper-V host, then registers itself as a Linux bus driver. It implements the standard bus functions for adding and removing devices to/from the bus. @@ -49,9 +49,9 @@ synthetic NIC is referred to as "netvsc" and the Linux driver for the synthetic SCSI controller is "storvsc". These drivers contain functions with names like "storvsc_connect_to_vsp". -VMbus channels +VMBus channels -------------- -An instance of a synthetic device uses VMbus channels to communicate +An instance of a synthetic device uses VMBus channels to communicate between the VSP and the VSC. Channels are bi-directional and used for passing messages. Most synthetic devices use a single channel, but the synthetic SCSI controller and synthetic NIC may use multiple @@ -73,7 +73,7 @@ write indices and some control flags, followed by the memory for the actual ring. The size of the ring is determined by the VSC in the guest and is specific to each synthetic device. The list of GPAs making up the ring is communicated to the Hyper-V host over the -VMbus control path as a GPA Descriptor List (GPADL). See function +VMBus control path as a GPA Descriptor List (GPADL). See function vmbus_establish_gpadl(). Each ring buffer is mapped into contiguous Linux kernel virtual @@ -102,9 +102,9 @@ resources. For Windows Server 2019 and later, this limit is approximately 1280 Mbytes. For versions prior to Windows Server 2019, the limit is approximately 384 Mbytes. -VMbus messages +VMBus messages -------------- -All VMbus messages have a standard header that includes the message +All VMBus messages have a standard header that includes the message length, the offset of the message payload, some flags, and a transactionID. The portion of the message after the header is unique to each VSP/VSC pair. @@ -137,7 +137,7 @@ control message contains a list of GPAs that describe the data buffer. 
For example, the storvsc driver uses this approach to specify the data buffers to/from which disk I/O is done. -Three functions exist to send VMbus messages: +Three functions exist to send VMBus messages: 1. vmbus_sendpacket(): Control-only messages and messages with embedded data -- no GPAs @@ -154,20 +154,20 @@ Historically, Linux guests have trusted Hyper-V to send well-formed and valid messages, and Linux drivers for synthetic devices did not fully validate messages. With the introduction of processor technologies that fully encrypt guest memory and that allow the -guest to not trust the hypervisor (AMD SNP-SEV, Intel TDX), trusting +guest to not trust the hypervisor (AMD SEV-SNP, Intel TDX), trusting the Hyper-V host is no longer a valid assumption. The drivers for -VMbus synthetic devices are being updated to fully validate any +VMBus synthetic devices are being updated to fully validate any values read from memory that is shared with Hyper-V, which includes -messages from VMbus devices. To facilitate such validation, +messages from VMBus devices. To facilitate such validation, messages read by the guest from the "in" ring buffer are copied to a temporary buffer that is not shared with Hyper-V. Validation is performed in this temporary buffer without the risk of Hyper-V maliciously modifying the message after it is validated but before it is used. -VMbus interrupts +VMBus interrupts ---------------- -VMbus provides a mechanism for the guest to interrupt the host when +VMBus provides a mechanism for the guest to interrupt the host when the guest has queued new messages in a ring buffer. The host expects that the guest will send an interrupt only when an "out" ring buffer transitions from empty to non-empty. If the guest sends @@ -177,62 +177,62 @@ interrupts, the host may throttle that guest by suspending its execution for a few seconds to prevent a denial-of-service attack. 
Similarly, the host will interrupt the guest when it sends a new -message on the VMbus control path, or when a VMbus channel "in" ring +message on the VMBus control path, or when a VMBus channel "in" ring buffer transitions from empty to non-empty. Each CPU in the guest -may receive VMbus interrupts, so they are best modeled as per-CPU +may receive VMBus interrupts, so they are best modeled as per-CPU interrupts in Linux. This model works well on arm64 where a single -per-CPU IRQ is allocated for VMbus. Since x86/x64 lacks support for +per-CPU IRQ is allocated for VMBus. Since x86/x64 lacks support for per-CPU IRQs, an x86 interrupt vector is statically allocated (see HYPERVISOR_CALLBACK_VECTOR) across all CPUs and explicitly coded to -call the VMbus interrupt service routine. These interrupts are +call the VMBus interrupt service routine. These interrupts are visible in /proc/interrupts on the "HYP" line. -The guest CPU that a VMbus channel will interrupt is selected by the +The guest CPU that a VMBus channel will interrupt is selected by the guest when the channel is created, and the host is informed of that -selection. VMbus devices are broadly grouped into two categories: +selection. VMBus devices are broadly grouped into two categories: -1. "Slow" devices that need only one VMbus channel. The devices +1. "Slow" devices that need only one VMBus channel. The devices (such as keyboard, mouse, heartbeat, and timesync) generate - relatively few interrupts. Their VMbus channels are all + relatively few interrupts. Their VMBus channels are all assigned to interrupt the VMBUS_CONNECT_CPU, which is always CPU 0. -2. "High speed" devices that may use multiple VMbus channels for +2. "High speed" devices that may use multiple VMBus channels for higher parallelism and performance. These devices include the - synthetic SCSI controller and synthetic NIC. Their VMbus + synthetic SCSI controller and synthetic NIC. 
Their VMBus channels interrupts are assigned to CPUs that are spread out among the available CPUs in the VM so that interrupts on multiple channels can be processed in parallel. -The assignment of VMbus channel interrupts to CPUs is done in the +The assignment of VMBus channel interrupts to CPUs is done in the function init_vp_index(). This assignment is done outside of the normal Linux interrupt affinity mechanism, so the interrupts are neither "unmanaged" nor "managed" interrupts. -The CPU that a VMbus channel will interrupt can be seen in +The CPU that a VMBus channel will interrupt can be seen in /sys/bus/vmbus/devices// channels//cpu. When running on later versions of Hyper-V, the CPU can be changed by writing a new value to this sysfs entry. Because the interrupt assignment is done outside of the normal Linux affinity mechanism, there are no entries in /proc/irq corresponding to individual -VMbus channel interrupts. +VMBus channel interrupts. An online CPU in a Linux guest may not be taken offline if it has -VMbus channel interrupts assigned to it. Any such channel +VMBus channel interrupts assigned to it. Any such channel interrupts must first be manually reassigned to another CPU as described above. When no channel interrupts are assigned to the CPU, it can be taken offline. -When a guest CPU receives a VMbus interrupt from the host, the +When a guest CPU receives a VMBus interrupt from the host, the function vmbus_isr() handles the interrupt. It first checks for channel interrupts by calling vmbus_chan_sched(), which looks at a bitmap setup by the host to determine which channels have pending interrupts on this CPU. If multiple channels have pending interrupts for this CPU, they are processed sequentially. When all channel interrupts have been processed, vmbus_isr() checks for and -processes any message received on the VMbus control path. +processes any message received on the VMBus control path. 
-The VMbus channel interrupt handling code is designed to work +The VMBus channel interrupt handling code is designed to work correctly even if an interrupt is received on a CPU other than the CPU assigned to the channel. Specifically, the code does not use CPU-based exclusion for correctness. In normal operation, Hyper-V @@ -242,23 +242,23 @@ when Hyper-V will make the transition. The code must work correctly even if there is a time lag before Hyper-V starts interrupting the new CPU. See comments in target_cpu_store(). -VMbus device creation/deletion +VMBus device creation/deletion ------------------------------ Hyper-V and the Linux guest have a separate message-passing path that is used for synthetic device creation and deletion. This -path does not use a VMbus channel. See vmbus_post_msg() and +path does not use a VMBus channel. See vmbus_post_msg() and vmbus_on_msg_dpc(). The first step is for the guest to connect to the generic -Hyper-V VMbus mechanism. As part of establishing this connection, -the guest and Hyper-V agree on a VMbus protocol version they will +Hyper-V VMBus mechanism. As part of establishing this connection, +the guest and Hyper-V agree on a VMBus protocol version they will use. This negotiation allows newer Linux kernels to run on older Hyper-V versions, and vice versa. The guest then tells Hyper-V to "send offers". Hyper-V sends an offer message to the guest for each synthetic device that the VM -is configured to have. Each VMbus device type has a fixed GUID -known as the "class ID", and each VMbus device instance is also +is configured to have. Each VMBus device type has a fixed GUID +known as the "class ID", and each VMBus device instance is also identified by a GUID. The offer message from Hyper-V contains both GUIDs to uniquely (within the VM) identify the device. There is one offer message for each device instance, so a VM with @@ -275,7 +275,7 @@ type based on the class ID, and invokes the correct driver to set up the device. 
Driver/device matching is performed using the standard Linux mechanism. -The device driver probe function opens the primary VMbus channel to +The device driver probe function opens the primary VMBus channel to the corresponding VSP. It allocates guest memory for the channel ring buffers and shares the ring buffer with the Hyper-V host by giving the host a list of GPAs for the ring buffer memory. See @@ -285,7 +285,7 @@ Once the ring buffer is set up, the device driver and VSP exchange setup messages via the primary channel. These messages may include negotiating the device protocol version to be used between the Linux VSC and the VSP on the Hyper-V host. The setup messages may also -include creating additional VMbus channels, which are somewhat +include creating additional VMBus channels, which are somewhat mis-named as "sub-channels" since they are functionally equivalent to the primary channel once they are created. From a0b134032e6c5552635c7142ad7f181eba2f3256 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sat, 11 May 2024 06:38:18 -0700 Subject: [PATCH 005/272] Documentation: hyperv: Improve synic and interrupt handling description Current documentation does not describe how Linux handles the synthetic interrupt controller (synic) that Hyper-V provides to guest VMs, nor how VMBus or timer interrupts are handled. Add text describing the synic and reorganize existing text to make this more clear. 
Signed-off-by: Michael Kelley Reviewed-by: Easwar Hariharan Reviewed-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240511133818.19649-2-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240511133818.19649-2-mhklinux@outlook.com> --- Documentation/virt/hyperv/clocks.rst | 21 +++++--- Documentation/virt/hyperv/vmbus.rst | 79 ++++++++++++++++++---------- 2 files changed, 66 insertions(+), 34 deletions(-) diff --git a/Documentation/virt/hyperv/clocks.rst b/Documentation/virt/hyperv/clocks.rst index a56f4837d4433..1760432658039 100644 --- a/Documentation/virt/hyperv/clocks.rst +++ b/Documentation/virt/hyperv/clocks.rst @@ -62,12 +62,21 @@ shared page with scale and offset values into user space. User space code performs the same algorithm of reading the TSC and applying the scale and offset to get the constant 10 MHz clock. -Linux clockevents are based on Hyper-V synthetic timer 0. While -Hyper-V offers 4 synthetic timers for each CPU, Linux only uses -timer 0. Interrupts from stimer0 are recorded on the "HVS" line in -/proc/interrupts. Clockevents based on the virtualized PIT and -local APIC timer also work, but the Hyper-V synthetic timer is -preferred. +Linux clockevents are based on Hyper-V synthetic timer 0 (stimer0). +While Hyper-V offers 4 synthetic timers for each CPU, Linux only uses +timer 0. In older versions of Hyper-V, an interrupt from stimer0 +results in a VMBus control message that is demultiplexed by +vmbus_isr() as described in the Documentation/virt/hyperv/vmbus.rst +documentation. In newer versions of Hyper-V, stimer0 interrupts can +be mapped to an architectural interrupt, which is referred to as +"Direct Mode". Linux prefers to use Direct Mode when available. Since +x86/x64 doesn't support per-CPU interrupts, Direct Mode statically +allocates an x86 interrupt vector (HYPERV_STIMER0_VECTOR) across all CPUs +and explicitly codes it to call the stimer0 interrupt handler. 
Hence +interrupts from stimer0 are recorded on the "HVS" line in /proc/interrupts +rather than being associated with a Linux IRQ. Clockevents based on the +virtualized PIT and local APIC timer also work, but Hyper-V stimer0 +is preferred. The driver for the Hyper-V synthetic system clock and timers is drivers/clocksource/hyperv_timer.c. diff --git a/Documentation/virt/hyperv/vmbus.rst b/Documentation/virt/hyperv/vmbus.rst index f0d83ebda626a..1dcef6a7fda3f 100644 --- a/Documentation/virt/hyperv/vmbus.rst +++ b/Documentation/virt/hyperv/vmbus.rst @@ -102,10 +102,10 @@ resources. For Windows Server 2019 and later, this limit is approximately 1280 Mbytes. For versions prior to Windows Server 2019, the limit is approximately 384 Mbytes. -VMBus messages --------------- -All VMBus messages have a standard header that includes the message -length, the offset of the message payload, some flags, and a +VMBus channel messages +---------------------- +All messages sent in a VMBus channel have a standard header that includes +the message length, the offset of the message payload, some flags, and a transactionID. The portion of the message after the header is unique to each VSP/VSC pair. @@ -137,7 +137,7 @@ control message contains a list of GPAs that describe the data buffer. For example, the storvsc driver uses this approach to specify the data buffers to/from which disk I/O is done. -Three functions exist to send VMBus messages: +Three functions exist to send VMBus channel messages: 1. vmbus_sendpacket(): Control-only messages and messages with embedded data -- no GPAs @@ -165,6 +165,37 @@ performed in this temporary buffer without the risk of Hyper-V maliciously modifying the message after it is validated but before it is used. +Synthetic Interrupt Controller (synic) +-------------------------------------- +Hyper-V provides each guest CPU with a synthetic interrupt controller +that is used by VMBus for host-guest communication. 
While each synic +defines 16 synthetic interrupts (SINT), Linux uses only one of the 16 +(VMBUS_MESSAGE_SINT). All interrupts related to communication between +the Hyper-V host and a guest CPU use that SINT. + +The SINT is mapped to a single per-CPU architectural interrupt (i.e., +an 8-bit x86/x64 interrupt vector, or an arm64 PPI INTID). Because +each CPU in the guest has a synic and may receive VMBus interrupts, +they are best modeled in Linux as per-CPU interrupts. This model works +well on arm64 where a single per-CPU Linux IRQ is allocated for +VMBUS_MESSAGE_SINT. This IRQ appears in /proc/interrupts as an IRQ labelled +"Hyper-V VMbus". Since x86/x64 lacks support for per-CPU IRQs, an x86 +interrupt vector is statically allocated (HYPERVISOR_CALLBACK_VECTOR) +across all CPUs and explicitly coded to call vmbus_isr(). In this case, +there's no Linux IRQ, and the interrupts are visible in aggregate in +/proc/interrupts on the "HYP" line. + +The synic provides the means to demultiplex the architectural interrupt into +one or more logical interrupts and route the logical interrupt to the proper +VMBus handler in Linux. This demultiplexing is done by vmbus_isr() and +related functions that access synic data structures. + +The synic is not modeled in Linux as an irq chip or irq domain, +and the demultiplexed logical interrupts are not Linux IRQs. As such, +they don't appear in /proc/interrupts or /proc/irq. The CPU +affinity for one of these logical interrupts is controlled via an +entry under /sys/bus/vmbus as described below. + VMBus interrupts ---------------- VMBus provides a mechanism for the guest to interrupt the host when @@ -176,16 +207,18 @@ unnecessary. If a guest sends an excessive number of unnecessary interrupts, the host may throttle that guest by suspending its execution for a few seconds to prevent a denial-of-service attack.
-Similarly, the host will interrupt the guest when it sends a new -message on the VMBus control path, or when a VMBus channel "in" ring -buffer transitions from empty to non-empty. Each CPU in the guest -may receive VMBus interrupts, so they are best modeled as per-CPU -interrupts in Linux. This model works well on arm64 where a single -per-CPU IRQ is allocated for VMBus. Since x86/x64 lacks support for -per-CPU IRQs, an x86 interrupt vector is statically allocated (see -HYPERVISOR_CALLBACK_VECTOR) across all CPUs and explicitly coded to -call the VMBus interrupt service routine. These interrupts are -visible in /proc/interrupts on the "HYP" line. +Similarly, the host will interrupt the guest via the synic when +it sends a new message on the VMBus control path, or when a VMBus +channel "in" ring buffer transitions from empty to non-empty due to +the host inserting a new VMBus channel message. The control message stream +and each VMBus channel "in" ring buffer are separate logical interrupts +that are demultiplexed by vmbus_isr(). It demultiplexes by first checking +for channel interrupts by calling vmbus_chan_sched(), which looks at a synic +bitmap to determine which channels have pending interrupts on this CPU. +If multiple channels have pending interrupts for this CPU, they are +processed sequentially. When all channel interrupts have been processed, +vmbus_isr() checks for and processes any messages received on the VMBus +control path. The guest CPU that a VMBus channel will interrupt is selected by the guest when the channel is created, and the host is informed of that @@ -212,10 +245,9 @@ neither "unmanaged" nor "managed" interrupts. The CPU that a VMBus channel will interrupt can be seen in /sys/bus/vmbus/devices// channels//cpu. When running on later versions of Hyper-V, the CPU can be changed -by writing a new value to this sysfs entry. 
Because the interrupt -assignment is done outside of the normal Linux affinity mechanism, -there are no entries in /proc/irq corresponding to individual -VMBus channel interrupts. +by writing a new value to this sysfs entry. Because VMBus channel +interrupts are not Linux IRQs, there are no entries in /proc/interrupts +or /proc/irq corresponding to individual VMBus channel interrupts. An online CPU in a Linux guest may not be taken offline if it has VMBus channel interrupts assigned to it. Any such channel @@ -223,15 +255,6 @@ interrupts must first be manually reassigned to another CPU as described above. When no channel interrupts are assigned to the CPU, it can be taken offline. -When a guest CPU receives a VMBus interrupt from the host, the -function vmbus_isr() handles the interrupt. It first checks for -channel interrupts by calling vmbus_chan_sched(), which looks at a -bitmap setup by the host to determine which channels have pending -interrupts on this CPU. If multiple channels have pending -interrupts for this CPU, they are processed sequentially. When all -channel interrupts have been processed, vmbus_isr() checks for and -processes any message received on the VMBus control path. - The VMBus channel interrupt handling code is designed to work correctly even if an interrupt is received on a CPU other than the CPU assigned to the channel. Specifically, the code does not use From db03d39053a97d2f2a6baec025ebdacbab5886d2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 27 May 2024 15:44:48 +0200 Subject: [PATCH 006/272] ovl: fix copy-up in tmpfile Move ovl_copy_up() call outside of ovl_want_write()/ovl_drop_write() region, since copy up may also call ovl_want_write() resulting in recursive locking on sb->s_writers. 
Reported-and-tested-by: syzbot+85e58cdf5b3136471d4b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000f6865106191c3e58@google.com/ Fixes: 9a87907de359 ("ovl: implement tmpfile") Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 116f542442ddd..ab65e98a1defd 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1314,10 +1314,6 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, int flags = file->f_flags | OVL_OPEN_FLAGS; int err; - err = ovl_copy_up(dentry->d_parent); - if (err) - return err; - old_cred = ovl_override_creds(dentry->d_sb); err = ovl_setup_cred_for_create(dentry, inode, mode, old_cred); if (err) @@ -1360,6 +1356,10 @@ static int ovl_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (!OVL_FS(dentry->d_sb)->tmpfile) return -EOPNOTSUPP; + err = ovl_copy_up(dentry->d_parent); + if (err) + return err; + err = ovl_want_write(dentry); if (err) return err; From 056620da899527c14cf36e5019a0decaf4cf0f79 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Mon, 20 May 2024 01:56:58 -0700 Subject: [PATCH 007/272] RDMA/bnxt_re: Fix the max msix vectors macro bnxt_re no longer decides the number of MSI-x vectors used by itself. It's decided by bnxt_en now. So when bnxt_en changes this value, a system crash is seen. Depend on the max value reported by bnxt_en instead of using its own macros.
Fixes: 303432211324 ("bnxt_en: Remove runtime interrupt vector allocation") Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1716195418-11767-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 9dca451ed5221..6974922e5609a 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -107,8 +107,6 @@ struct bnxt_re_gsi_context { struct bnxt_re_sqp_entries *sqp_tbl; }; -#define BNXT_RE_MIN_MSIX 2 -#define BNXT_RE_MAX_MSIX 9 #define BNXT_RE_AEQ_IDX 0 #define BNXT_RE_NQ_IDX 1 #define BNXT_RE_GEN_P5_MAX_VF 64 @@ -168,7 +166,7 @@ struct bnxt_re_dev { struct bnxt_qplib_rcfw rcfw; /* NQ */ - struct bnxt_qplib_nq nq[BNXT_RE_MAX_MSIX]; + struct bnxt_qplib_nq nq[BNXT_MAX_ROCE_MSIX]; /* Device Resources */ struct bnxt_qplib_dev_attr dev_attr; From 03fa18a992d5626fd7bf3557a52e826bf8b326b3 Mon Sep 17 00:00:00 2001 From: Honggang LI Date: Thu, 16 May 2024 17:50:52 +0800 Subject: [PATCH 008/272] RDMA/rxe: Fix data copy for IB_SEND_INLINE For RDMA Send and Write with IB_SEND_INLINE, the memory buffers specified in sge list will be placed inline in the Send Request. The data should be copied by CPU from the virtual addresses of corresponding sge list DMA addresses. 
Cc: stable@kernel.org Fixes: 8d7c7c0eeb74 ("RDMA: Add ib_virt_dma_to_page()") Signed-off-by: Honggang LI Link: https://lore.kernel.org/r/20240516095052.542767-1-honggangli@163.com Reviewed-by: Zhu Yanjun Reviewed-by: Li Zhijian Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index c7d4d8ab5a094..de6238ee4379b 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -812,7 +812,7 @@ static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, int i; for (i = 0; i < ibwr->num_sge; i++, sge++) { - memcpy(p, ib_virt_dma_to_page(sge->addr), sge->length); + memcpy(p, ib_virt_dma_to_ptr(sge->addr), sge->length); p += sge->length; } } From d339131bf02d4ed918415574082caf5e8af6e664 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Fri, 31 May 2024 12:27:16 +0100 Subject: [PATCH 009/272] ALSA: hda: cs35l56: Fix lifecycle of codec pointer The codec should be cleared when the amp driver is unbound and when resuming it should be tested to prevent loading firmware into the device and ALSA in a partially configured system state. 
Signed-off-by: Simon Trimmer Link: https://lore.kernel.org/r/20240531112716.25323-1-simont@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l56_hda.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/pci/hda/cs35l56_hda.c b/sound/pci/hda/cs35l56_hda.c index 11b0570ff56d4..0923e2589f5f7 100644 --- a/sound/pci/hda/cs35l56_hda.c +++ b/sound/pci/hda/cs35l56_hda.c @@ -735,6 +735,8 @@ static void cs35l56_hda_unbind(struct device *dev, struct device *master, void * if (comps[cs35l56->index].dev == dev) memset(&comps[cs35l56->index], 0, sizeof(*comps)); + cs35l56->codec = NULL; + dev_dbg(cs35l56->base.dev, "Unbound\n"); } @@ -840,6 +842,9 @@ static int cs35l56_hda_system_resume(struct device *dev) cs35l56->suspended = false; + if (!cs35l56->codec) + return 0; + ret = cs35l56_is_fw_reload_needed(&cs35l56->base); dev_dbg(cs35l56->base.dev, "fw_reload_needed: %d\n", ret); if (ret > 0) { From 6386682cdc8b41319c92fbbe421953e33a28840c Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Fri, 31 May 2024 13:08:20 +0100 Subject: [PATCH 010/272] ALSA: hda: cs35l41: Possible null pointer dereference in cs35l41_hda_unbind() The cs35l41_hda_unbind() function clears the hda_component entry matching its index and then dereferences the codec pointer held in the first element of the hda_component array; this is an issue when the device index was 0. Instead use the codec pointer stashed in the cs35l41_hda structure as it will still be valid.
Fixes: 7cf5ce66dfda ("ALSA: hda: cs35l41: Add device_link between HDA and cs35l41_hda") Signed-off-by: Simon Trimmer Link: https://lore.kernel.org/r/20240531120820.35367-1-simont@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 6c49e5c6cd208..d54d4d60b03ec 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -1495,7 +1495,7 @@ static void cs35l41_hda_unbind(struct device *dev, struct device *master, void * if (comps[cs35l41->index].dev == dev) { memset(&comps[cs35l41->index], 0, sizeof(*comps)); sleep_flags = lock_system_sleep(); - device_link_remove(&comps->codec->core.dev, cs35l41->dev); + device_link_remove(&cs35l41->codec->core.dev, cs35l41->dev); unlock_system_sleep(sleep_flags); } } From 55fac50ea46f46a22a92e2139b92afaa3822ad19 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 31 May 2024 14:37:17 +0200 Subject: [PATCH 011/272] ALSA: seq: ump: Fix missing System Reset message handling The conversion from System Reset event to UMP was missing. Add the entry for a conversion to a proper UMP System message. 
Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Link: https://lore.kernel.org/r/20240531123718.13420-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index 171fb75267afa..d81f776a4c3dd 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -1075,6 +1075,8 @@ static const struct seq_ev_to_ump seq_ev_ump_encoders[] = { system_ev_to_ump_midi1, system_ev_to_ump_midi2 }, { SNDRV_SEQ_EVENT_SENSING, UMP_SYSTEM_STATUS_ACTIVE_SENSING, system_ev_to_ump_midi1, system_ev_to_ump_midi2 }, + { SNDRV_SEQ_EVENT_RESET, UMP_SYSTEM_STATUS_RESET, + system_ev_to_ump_midi1, system_ev_to_ump_midi2 }, }; static const struct seq_ev_to_ump *find_ump_encoder(int type) From 08f0fa5d6aa9488f752eb5410e32636f143b3d8e Mon Sep 17 00:00:00 2001 From: Joao Paulo Goncalves Date: Tue, 7 May 2024 11:35:55 -0300 Subject: [PATCH 012/272] arm64: dts: freescale: imx8mm-verdin: Fix GPU speed The GPU clock was reduced on iMX8MM SOC device tree to prevent boards that don't support GPU overdrive from being out of specification. However, this caused a regression in GPU speed for the Verdin iMX8MM, which does support GPU overdrive. This patch fixes this by enabling overdrive mode in the SOM dtsi. 
Fixes: 1f794d3eed53 ("arm64: dts: imx8mm: Reduce GPU to nominal speed") Signed-off-by: Joao Paulo Goncalves Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index 4768b05fd7659..0d9abca588218 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -6,6 +6,7 @@ #include #include #include "imx8mm.dtsi" +#include "imx8mm-overdrive.dtsi" / { chosen { From e2d8ea0a066a6db51f31efd2710057271d685d2e Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 29 Apr 2024 00:49:35 +0000 Subject: [PATCH 013/272] soundwire: fix usages of device_get_named_child_node() The documentation for device_get_named_child_node() mentions this important point: " The caller is responsible for calling fwnode_handle_put() on the returned fwnode pointer. " Add fwnode_handle_put() to avoid leaked references. 
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20240429004935.2400191-1-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/amd_manager.c | 3 +++ drivers/soundwire/intel_auxdevice.c | 6 +++++- drivers/soundwire/mipi_disco.c | 30 +++++++++++++++++++++++------ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c index 20d94bcfc9b4f..795e223f7e5c2 100644 --- a/drivers/soundwire/amd_manager.c +++ b/drivers/soundwire/amd_manager.c @@ -571,6 +571,9 @@ static int sdw_master_read_amd_prop(struct sdw_bus *bus) amd_manager->wake_en_mask = wake_en_mask; fwnode_property_read_u32(link, "amd-sdw-power-mode", &power_mode_mask); amd_manager->power_mode_mask = power_mode_mask; + + fwnode_handle_put(link); + return 0; } diff --git a/drivers/soundwire/intel_auxdevice.c b/drivers/soundwire/intel_auxdevice.c index 17cf27e6ea738..18517121cc898 100644 --- a/drivers/soundwire/intel_auxdevice.c +++ b/drivers/soundwire/intel_auxdevice.c @@ -155,8 +155,10 @@ static int sdw_master_read_intel_prop(struct sdw_bus *bus) SDW_MASTER_QUIRKS_CLEAR_INITIAL_PARITY; intel_prop = devm_kzalloc(bus->dev, sizeof(*intel_prop), GFP_KERNEL); - if (!intel_prop) + if (!intel_prop) { + fwnode_handle_put(link); return -ENOMEM; + } /* initialize with hardware defaults, in case the properties are not found */ intel_prop->doaise = 0x1; @@ -184,6 +186,8 @@ static int sdw_master_read_intel_prop(struct sdw_bus *bus) intel_prop->dodse, intel_prop->dods); + fwnode_handle_put(link); + return 0; } diff --git a/drivers/soundwire/mipi_disco.c b/drivers/soundwire/mipi_disco.c index 55a9c51c84c19..e5d9df26d4dc9 100644 --- a/drivers/soundwire/mipi_disco.c +++ b/drivers/soundwire/mipi_disco.c @@ -66,8 +66,10 @@ int sdw_master_read_prop(struct sdw_bus *bus) prop->clk_freq = devm_kcalloc(bus->dev, prop->num_clk_freq, sizeof(*prop->clk_freq), GFP_KERNEL); - if (!prop->clk_freq) + if 
(!prop->clk_freq) { + fwnode_handle_put(link); return -ENOMEM; + } fwnode_property_read_u32_array(link, "mipi-sdw-clock-frequencies-supported", @@ -92,8 +94,10 @@ int sdw_master_read_prop(struct sdw_bus *bus) prop->clk_gears = devm_kcalloc(bus->dev, prop->num_clk_gears, sizeof(*prop->clk_gears), GFP_KERNEL); - if (!prop->clk_gears) + if (!prop->clk_gears) { + fwnode_handle_put(link); return -ENOMEM; + } fwnode_property_read_u32_array(link, "mipi-sdw-supported-clock-gears", @@ -116,6 +120,8 @@ int sdw_master_read_prop(struct sdw_bus *bus) fwnode_property_read_u32(link, "mipi-sdw-command-error-threshold", &prop->err_threshold); + fwnode_handle_put(link); + return 0; } EXPORT_SYMBOL(sdw_master_read_prop); @@ -197,8 +203,10 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave, dpn[i].num_words, sizeof(*dpn[i].words), GFP_KERNEL); - if (!dpn[i].words) + if (!dpn[i].words) { + fwnode_handle_put(node); return -ENOMEM; + } fwnode_property_read_u32_array(node, "mipi-sdw-port-wordlength-configs", @@ -236,8 +244,10 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave, dpn[i].num_channels, sizeof(*dpn[i].channels), GFP_KERNEL); - if (!dpn[i].channels) + if (!dpn[i].channels) { + fwnode_handle_put(node); return -ENOMEM; + } fwnode_property_read_u32_array(node, "mipi-sdw-channel-number-list", @@ -251,8 +261,10 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave, dpn[i].num_ch_combinations, sizeof(*dpn[i].ch_combinations), GFP_KERNEL); - if (!dpn[i].ch_combinations) + if (!dpn[i].ch_combinations) { + fwnode_handle_put(node); return -ENOMEM; + } fwnode_property_read_u32_array(node, "mipi-sdw-channel-combination-list", @@ -274,6 +286,8 @@ static int sdw_slave_read_dpn(struct sdw_slave *slave, /* TODO: Read audio mode */ + fwnode_handle_put(node); + i++; } @@ -348,10 +362,14 @@ int sdw_slave_read_prop(struct sdw_slave *slave) prop->dp0_prop = devm_kzalloc(&slave->dev, sizeof(*prop->dp0_prop), GFP_KERNEL); - if (!prop->dp0_prop) + if (!prop->dp0_prop) { + 
fwnode_handle_put(port); return -ENOMEM; + } sdw_slave_read_dp0(slave, port, prop->dp0_prop); + + fwnode_handle_put(port); } /* From 5314e84c33e7ad61df5203df540626ac59f9dcd9 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Mon, 27 May 2024 10:20:35 +0300 Subject: [PATCH 014/272] phy: qcom-qmp: qserdes-txrx: Add missing registers offsets Currently, the x1e80100 uses pure V6 register offsets for the DP part of the combo PHY. This hasn't been an issue because external DP is not yet enabled on any of the boards. But in order to enable it, all these new V6 N4 register offsets are needed. So add them. Fixes: 762c3565f3c8 ("phy: qcom-qmp: qserdes-txrx: Add V6 N4 register offsets") Co-developed-by: Kuogee Hsieh Signed-off-by: Kuogee Hsieh Signed-off-by: Abel Vesa Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240527-x1e80100-phy-qualcomm-combo-fix-dp-v1-1-be8a0b882117@linaro.org Signed-off-by: Vinod Koul --- .../phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h index a814ad11af071..d37cc0d4fd365 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h @@ -6,11 +6,24 @@ #ifndef QCOM_PHY_QMP_QSERDES_TXRX_V6_N4_H_ #define QCOM_PHY_QMP_QSERDES_TXRX_V6_N4_H_ +#define QSERDES_V6_N4_TX_CLKBUF_ENABLE 0x08 +#define QSERDES_V6_N4_TX_TX_EMP_POST1_LVL 0x0c +#define QSERDES_V6_N4_TX_TX_DRV_LVL 0x14 +#define QSERDES_V6_N4_TX_RESET_TSYNC_EN 0x1c +#define QSERDES_V6_N4_TX_PRE_STALL_LDO_BOOST_EN 0x20 #define QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_TX 0x30 #define QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_RX 0x34 +#define QSERDES_V6_N4_TX_TRANSCEIVER_BIAS_EN 0x48 +#define QSERDES_V6_N4_TX_HIGHZ_DRVR_EN 0x4c +#define QSERDES_V6_N4_TX_TX_POL_INV 0x50 +#define QSERDES_V6_N4_TX_PARRATE_REC_DETECT_IDLE_EN 0x54 #define
QSERDES_V6_N4_TX_LANE_MODE_1 0x78 #define QSERDES_V6_N4_TX_LANE_MODE_2 0x7c #define QSERDES_V6_N4_TX_LANE_MODE_3 0x80 +#define QSERDES_V6_N4_TX_TRAN_DRVR_EMP_EN 0xac +#define QSERDES_V6_N4_TX_TX_BAND 0xd8 +#define QSERDES_V6_N4_TX_INTERFACE_SELECT 0xe4 +#define QSERDES_V6_N4_TX_VMODE_CTRL1 0xb0 #define QSERDES_V6_N4_RX_UCDR_FO_GAIN_RATE2 0x8 #define QSERDES_V6_N4_RX_UCDR_SO_GAIN_RATE2 0x18 From 99bf89626335bbec71d8461f0faec88551440850 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Mon, 27 May 2024 10:20:36 +0300 Subject: [PATCH 015/272] phy: qcom-qmp: pcs: Add missing v6 N4 register offsets The new X1E80100 SoC bumps up the HW version of QMP phy to v6 N4 for combo USB and DP PHY. Currently, the X1E80100 uses the pure V6 PCS register offsets, which are different. Add the offsets so the mentioned platform can be fixed later on. Add the new PCS offsets in a dedicated header file. Fixes: d7b3579f84f7 ("phy: qcom-qmp-combo: Add x1e80100 USB/DP combo phys") Co-developed-by: Kuogee Hsieh Signed-off-by: Kuogee Hsieh Signed-off-by: Abel Vesa Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240527-x1e80100-phy-qualcomm-combo-fix-dp-v1-2-be8a0b882117@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6-n4.h | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6-n4.h diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6-n4.h b/drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6-n4.h new file mode 100644 index 0000000000000..b3024714dab4e --- /dev/null +++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6-n4.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023, Linaro Limited + */ + +#ifndef QCOM_PHY_QMP_PCS_V6_N4_H_ +#define QCOM_PHY_QMP_PCS_V6_N4_H_ + +/* Only for QMP V6 N4 PHY - USB/PCIe PCS registers */ +#define QPHY_V6_N4_PCS_SW_RESET 0x000 +#define QPHY_V6_N4_PCS_PCS_STATUS1 0x014 +#define QPHY_V6_N4_PCS_POWER_DOWN_CONTROL 0x040 +#define 
QPHY_V6_N4_PCS_START_CONTROL 0x044 +#define QPHY_V6_N4_PCS_POWER_STATE_CONFIG1 0x090 +#define QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG1 0x0c4 +#define QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG2 0x0c8 +#define QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG3 0x0cc +#define QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG6 0x0d8 +#define QPHY_V6_N4_PCS_REFGEN_REQ_CONFIG1 0x0dc +#define QPHY_V6_N4_PCS_RX_SIGDET_LVL 0x188 +#define QPHY_V6_N4_PCS_RCVR_DTCT_DLY_P1U2_L 0x190 +#define QPHY_V6_N4_PCS_RCVR_DTCT_DLY_P1U2_H 0x194 +#define QPHY_V6_N4_PCS_RATE_SLEW_CNTRL1 0x198 +#define QPHY_V6_N4_PCS_RX_CONFIG 0x1b0 +#define QPHY_V6_N4_PCS_ALIGN_DETECT_CONFIG1 0x1c0 +#define QPHY_V6_N4_PCS_ALIGN_DETECT_CONFIG2 0x1c4 +#define QPHY_V6_N4_PCS_PCS_TX_RX_CONFIG 0x1d0 +#define QPHY_V6_N4_PCS_EQ_CONFIG1 0x1dc +#define QPHY_V6_N4_PCS_EQ_CONFIG2 0x1e0 +#define QPHY_V6_N4_PCS_EQ_CONFIG5 0x1ec + +#endif From 163c1a356a847ab4767200fd4a45b3f8e4ddc900 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Mon, 27 May 2024 10:20:37 +0300 Subject: [PATCH 016/272] phy: qcom: qmp-combo: Switch from V6 to V6 N4 register offsets Currently, none of the X1E80100 supported boards upstream have enabled DP. As for USB, the reason it is not broken when it's obvious that the offsets are wrong is because the only difference with respect to USB is the difference in register name. The V6 uses QPHY_V6_PCS_CDR_RESET_TIME while V6 N4 uses QPHY_V6_N4_PCS_RX_CONFIG. Now, in order for the DP to work, the DP serdes tables need to be added as they have different values for V6 N4 when compared to V6 ones, even though they use the same V6 offsets. While at it, switch swing and pre-emphasis tables to V6 as well. 
Fixes: d7b3579f84f7 ("phy: qcom-qmp-combo: Add x1e80100 USB/DP combo phys") Co-developed-by: Kuogee Hsieh Signed-off-by: Kuogee Hsieh Signed-off-by: Abel Vesa Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240527-x1e80100-phy-qualcomm-combo-fix-dp-v1-3-be8a0b882117@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 189 ++++++++++++++++++---- drivers/phy/qualcomm/phy-qcom-qmp.h | 2 + 2 files changed, 162 insertions(+), 29 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c index 7f999e8a433d8..7b00945f7191d 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c @@ -187,6 +187,31 @@ static const unsigned int qmp_v6_usb3phy_regs_layout[QPHY_LAYOUT_SIZE] = { [QPHY_TX_TRANSCEIVER_BIAS_EN] = QSERDES_V6_TX_TRANSCEIVER_BIAS_EN, }; +static const unsigned int qmp_v6_n4_usb3phy_regs_layout[QPHY_LAYOUT_SIZE] = { + [QPHY_SW_RESET] = QPHY_V6_N4_PCS_SW_RESET, + [QPHY_START_CTRL] = QPHY_V6_N4_PCS_START_CONTROL, + [QPHY_PCS_STATUS] = QPHY_V6_N4_PCS_PCS_STATUS1, + [QPHY_PCS_POWER_DOWN_CONTROL] = QPHY_V6_N4_PCS_POWER_DOWN_CONTROL, + + /* In PCS_USB */ + [QPHY_PCS_AUTONOMOUS_MODE_CTRL] = QPHY_V6_PCS_USB3_AUTONOMOUS_MODE_CTRL, + [QPHY_PCS_LFPS_RXTERM_IRQ_CLEAR] = QPHY_V6_PCS_USB3_LFPS_RXTERM_IRQ_CLEAR, + + [QPHY_COM_RESETSM_CNTRL] = QSERDES_V6_COM_RESETSM_CNTRL, + [QPHY_COM_C_READY_STATUS] = QSERDES_V6_COM_C_READY_STATUS, + [QPHY_COM_CMN_STATUS] = QSERDES_V6_COM_CMN_STATUS, + [QPHY_COM_BIAS_EN_CLKBUFLR_EN] = QSERDES_V6_COM_PLL_BIAS_EN_CLK_BUFLR_EN, + + [QPHY_DP_PHY_STATUS] = QSERDES_V6_DP_PHY_STATUS, + [QPHY_DP_PHY_VCO_DIV] = QSERDES_V6_DP_PHY_VCO_DIV, + + [QPHY_TX_TX_POL_INV] = QSERDES_V6_N4_TX_TX_POL_INV, + [QPHY_TX_TX_DRV_LVL] = QSERDES_V6_N4_TX_TX_DRV_LVL, + [QPHY_TX_TX_EMP_POST1_LVL] = QSERDES_V6_N4_TX_TX_EMP_POST1_LVL, + [QPHY_TX_HIGHZ_DRVR_EN] = QSERDES_V6_N4_TX_HIGHZ_DRVR_EN, + [QPHY_TX_TRANSCEIVER_BIAS_EN] = 
QSERDES_V6_N4_TX_TRANSCEIVER_BIAS_EN, +}; + static const struct qmp_phy_init_tbl qmp_v3_usb3_serdes_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V3_COM_PLL_IVCO, 0x07), QMP_PHY_INIT_CFG(QSERDES_V3_COM_SYSCLK_EN_SEL, 0x14), @@ -997,6 +1022,31 @@ static const struct qmp_phy_init_tbl qmp_v6_dp_serdes_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V6_COM_CORE_CLK_EN, 0x0f), }; +static const struct qmp_phy_init_tbl qmp_v6_n4_dp_serdes_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SVS_MODE_CLK_SEL, 0x15), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYSCLK_EN_SEL, 0x3b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYS_CLK_CTRL, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CLK_ENABLE1, 0x0c), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYSCLK_BUF_ENABLE, 0x06), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CLK_SELECT, 0x30), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_IVCO, 0x07), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE0, 0x36), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE0, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE0, 0x06), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x34), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START1_MODE0, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START2_MODE0, 0xc0), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CMN_CONFIG_1, 0x12), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_INTEGLOOP_GAIN0_MODE0, 0x3f), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_INTEGLOOP_GAIN1_MODE0, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_MAP, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BG_TIMER, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CORE_CLK_DIV_MODE0, 0x14), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_CTRL, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_BIAS_EN_CLK_BUFLR_EN, 0x17), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CORE_CLK_EN, 0x0f), +}; + static const struct qmp_phy_init_tbl qmp_v6_dp_tx_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V6_TX_VMODE_CTRL1, 0x40), QMP_PHY_INIT_CFG(QSERDES_V6_TX_PRE_STALL_LDO_BOOST_EN, 0x30), @@ -1011,6 +1061,19 @@ static const struct qmp_phy_init_tbl qmp_v6_dp_tx_tbl[] = { 
QMP_PHY_INIT_CFG(QSERDES_V6_TX_TX_BAND, 0x4), }; +static const struct qmp_phy_init_tbl qmp_v6_n4_dp_tx_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_VMODE_CTRL1, 0x40), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_PRE_STALL_LDO_BOOST_EN, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_INTERFACE_SELECT, 0xff), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_CLKBUF_ENABLE, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_RESET_TSYNC_EN, 0x03), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_TRAN_DRVR_EMP_EN, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_PARRATE_REC_DETECT_IDLE_EN, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_TX, 0x11), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_RX, 0x11), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_TX_BAND, 0x1), +}; + static const struct qmp_phy_init_tbl qmp_v6_dp_serdes_tbl_rbr[] = { QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x05), QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x34), @@ -1059,6 +1122,74 @@ static const struct qmp_phy_init_tbl qmp_v6_dp_serdes_tbl_hbr3[] = { QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x0c), }; +static const struct qmp_phy_init_tbl qmp_v6_n4_dp_serdes_tbl_rbr[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x05), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x34), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_EN, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE0, 0x0b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x37), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE1_MODE0, 0x71), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x0c), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_EN_CENTER, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_ADJ_PER1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER1, 0x6b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER2, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE0, 0x92), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE0, 0x01), +}; + +static const struct 
qmp_phy_init_tbl qmp_v6_n4_dp_serdes_tbl_hbr[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x03), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x34), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_EN, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE0, 0x0b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x07), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x07), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE1_MODE0, 0x71), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x0c), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_EN_CENTER, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_ADJ_PER1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER1, 0x6b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER2, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE0, 0x92), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE0, 0x01), +}; + +static const struct qmp_phy_init_tbl qmp_v6_n4_dp_serdes_tbl_hbr2[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x46), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_EN, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE0, 0x05), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x0e), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE1_MODE0, 0x97), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x10), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_EN_CENTER, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_ADJ_PER1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER1, 0x6b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER2, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE0, 0x18), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE0, 0x02), +}; + +static const struct qmp_phy_init_tbl qmp_v6_n4_dp_serdes_tbl_hbr3[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x34), + 
QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_EN, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE0, 0x0b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x17), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x15), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE1_MODE0, 0x71), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x0c), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_EN_CENTER, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_ADJ_PER1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER1, 0x6b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER2, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE0, 0x92), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE0, 0x01), +}; + static const struct qmp_phy_init_tbl sc8280xp_usb43dp_serdes_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V5_COM_SSC_EN_CENTER, 0x01), QMP_PHY_INIT_CFG(QSERDES_V5_COM_SSC_PER1, 0x31), @@ -1273,20 +1404,20 @@ static const struct qmp_phy_init_tbl x1e80100_usb43dp_rx_tbl[] = { }; static const struct qmp_phy_init_tbl x1e80100_usb43dp_pcs_tbl[] = { - QMP_PHY_INIT_CFG(QPHY_V6_PCS_RCVR_DTCT_DLY_P1U2_L, 0xe7), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_RCVR_DTCT_DLY_P1U2_H, 0x03), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG1, 0xc4), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG2, 0x89), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG3, 0x20), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG6, 0x13), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_REFGEN_REQ_CONFIG1, 0x21), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_RX_SIGDET_LVL, 0x55), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_CDR_RESET_TIME, 0x0a), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_ALIGN_DETECT_CONFIG1, 0xd4), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_ALIGN_DETECT_CONFIG2, 0x30), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_PCS_TX_RX_CONFIG, 0x0c), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_EQ_CONFIG1, 0x4b), - QMP_PHY_INIT_CFG(QPHY_V6_PCS_EQ_CONFIG5, 0x10), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_RCVR_DTCT_DLY_P1U2_L, 0xe7), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_RCVR_DTCT_DLY_P1U2_H, 0x03), + 
QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG1, 0xc4), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG2, 0x89), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG3, 0x20), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_LOCK_DETECT_CONFIG6, 0x13), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_REFGEN_REQ_CONFIG1, 0x21), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_RX_SIGDET_LVL, 0x55), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_RX_CONFIG, 0x0a), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_ALIGN_DETECT_CONFIG1, 0xd4), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_ALIGN_DETECT_CONFIG2, 0x30), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_PCS_TX_RX_CONFIG, 0x0c), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_EQ_CONFIG1, 0x4b), + QMP_PHY_INIT_CFG(QPHY_V6_N4_PCS_EQ_CONFIG5, 0x10), }; static const struct qmp_phy_init_tbl x1e80100_usb43dp_pcs_usb_tbl[] = { @@ -1794,22 +1925,22 @@ static const struct qmp_phy_cfg x1e80100_usb3dpphy_cfg = { .pcs_usb_tbl = x1e80100_usb43dp_pcs_usb_tbl, .pcs_usb_tbl_num = ARRAY_SIZE(x1e80100_usb43dp_pcs_usb_tbl), - .dp_serdes_tbl = qmp_v6_dp_serdes_tbl, - .dp_serdes_tbl_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl), - .dp_tx_tbl = qmp_v6_dp_tx_tbl, - .dp_tx_tbl_num = ARRAY_SIZE(qmp_v6_dp_tx_tbl), + .dp_serdes_tbl = qmp_v6_n4_dp_serdes_tbl, + .dp_serdes_tbl_num = ARRAY_SIZE(qmp_v6_n4_dp_serdes_tbl), + .dp_tx_tbl = qmp_v6_n4_dp_tx_tbl, + .dp_tx_tbl_num = ARRAY_SIZE(qmp_v6_n4_dp_tx_tbl), - .serdes_tbl_rbr = qmp_v6_dp_serdes_tbl_rbr, - .serdes_tbl_rbr_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_rbr), - .serdes_tbl_hbr = qmp_v6_dp_serdes_tbl_hbr, - .serdes_tbl_hbr_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_hbr), - .serdes_tbl_hbr2 = qmp_v6_dp_serdes_tbl_hbr2, - .serdes_tbl_hbr2_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_hbr2), - .serdes_tbl_hbr3 = qmp_v6_dp_serdes_tbl_hbr3, - .serdes_tbl_hbr3_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_hbr3), + .serdes_tbl_rbr = qmp_v6_n4_dp_serdes_tbl_rbr, + .serdes_tbl_rbr_num = ARRAY_SIZE(qmp_v6_n4_dp_serdes_tbl_rbr), + .serdes_tbl_hbr = qmp_v6_n4_dp_serdes_tbl_hbr, + .serdes_tbl_hbr_num = 
ARRAY_SIZE(qmp_v6_n4_dp_serdes_tbl_hbr), + .serdes_tbl_hbr2 = qmp_v6_n4_dp_serdes_tbl_hbr2, + .serdes_tbl_hbr2_num = ARRAY_SIZE(qmp_v6_n4_dp_serdes_tbl_hbr2), + .serdes_tbl_hbr3 = qmp_v6_n4_dp_serdes_tbl_hbr3, + .serdes_tbl_hbr3_num = ARRAY_SIZE(qmp_v6_n4_dp_serdes_tbl_hbr3), - .swing_hbr_rbr = &qmp_dp_v5_voltage_swing_hbr_rbr, - .pre_emphasis_hbr_rbr = &qmp_dp_v5_pre_emphasis_hbr_rbr, + .swing_hbr_rbr = &qmp_dp_v6_voltage_swing_hbr_rbr, + .pre_emphasis_hbr_rbr = &qmp_dp_v6_pre_emphasis_hbr_rbr, .swing_hbr3_hbr2 = &qmp_dp_v5_voltage_swing_hbr3_hbr2, .pre_emphasis_hbr3_hbr2 = &qmp_dp_v5_pre_emphasis_hbr3_hbr2, @@ -1822,7 +1953,7 @@ static const struct qmp_phy_cfg x1e80100_usb3dpphy_cfg = { .num_resets = ARRAY_SIZE(msm8996_usb3phy_reset_l), .vreg_list = qmp_phy_vreg_l, .num_vregs = ARRAY_SIZE(qmp_phy_vreg_l), - .regs = qmp_v45_usb3phy_regs_layout, + .regs = qmp_v6_n4_usb3phy_regs_layout, }; static const struct qmp_phy_cfg sm6350_usb3dpphy_cfg = { diff --git a/drivers/phy/qualcomm/phy-qcom-qmp.h b/drivers/phy/qualcomm/phy-qcom-qmp.h index d10b8f653c4b2..d0f41e4aaa855 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp.h @@ -46,6 +46,8 @@ #include "phy-qcom-qmp-pcs-v6.h" +#include "phy-qcom-qmp-pcs-v6-n4.h" + #include "phy-qcom-qmp-pcs-v6_20.h" #include "phy-qcom-qmp-pcs-v7.h" From 8141b6da1763b9db009e5dcf873869bb31bcef45 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 3 Jun 2024 19:01:00 +0200 Subject: [PATCH 017/272] regulator: tps6594-regulator: Fix the number of irqs for TPS65224 and TPS6594 The number of irqs is computed to allocate the right amount of memory for the irq data. An array of struct tps6594_regulator_irq_data is allocated one time for all the irqs. Each irq uses one cell of the array. If the computed number of irqs is not correct, not allocated memory could be used. Fix the values used in the calculation for TPS6594 and TPS65224. 
Fixes: 00c826525fba (regulator: tps6594-regulator: Add TI TPS65224 PMIC regulators) Signed-off-by: Thomas Richard Tested-by: Nishanth Menon Link: https://msgid.link/r/20240603170100.2394402-1-thomas.richard@bootlin.com Signed-off-by: Mark Brown --- drivers/regulator/tps6594-regulator.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/regulator/tps6594-regulator.c b/drivers/regulator/tps6594-regulator.c index 4a859f4c0f835..ac53792e3fede 100644 --- a/drivers/regulator/tps6594-regulator.c +++ b/drivers/regulator/tps6594-regulator.c @@ -653,18 +653,14 @@ static int tps6594_regulator_probe(struct platform_device *pdev) } } - if (tps->chip_id == LP8764) { - nr_buck = ARRAY_SIZE(buck_regs); - nr_ldo = 0; - nr_types = REGS_INT_NB; - } else if (tps->chip_id == TPS65224) { + if (tps->chip_id == TPS65224) { nr_buck = ARRAY_SIZE(tps65224_buck_regs); nr_ldo = ARRAY_SIZE(tps65224_ldo_regs); - nr_types = REGS_INT_NB; + nr_types = TPS65224_REGS_INT_NB; } else { nr_buck = ARRAY_SIZE(buck_regs); - nr_ldo = ARRAY_SIZE(tps6594_ldo_regs); - nr_types = TPS65224_REGS_INT_NB; + nr_ldo = (tps->chip_id == LP8764) ? 0 : ARRAY_SIZE(tps6594_ldo_regs); + nr_types = REGS_INT_NB; } reg_irq_nb = nr_types * (nr_buck + nr_ldo); From d9fef76e89498bf99cdb03f77b7091d7e95d7edd Mon Sep 17 00:00:00 2001 From: Julien Panis Date: Thu, 16 May 2024 12:44:37 +0200 Subject: [PATCH 018/272] thermal/drivers/mediatek/lvts_thermal: Remove filtered mode for mt8188 Filtered mode is not supported on mt8188 SoC and is the source of bad results. Move to immediate mode which provides good temperatures. 
Fixes: f4745f546e60 ("thermal/drivers/mediatek/lvts_thermal: Add MT8188 support") Reviewed-by: Nicolas Pitre Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Julien Panis Link: https://lore.kernel.org/r/20240516-mtk-thermal-mt8188-mode-fix-v2-1-40a317442c62@baylibre.com Signed-off-by: Daniel Lezcano --- drivers/thermal/mediatek/lvts_thermal.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c index 0bb3a495b56ed..82c355c466cfe 100644 --- a/drivers/thermal/mediatek/lvts_thermal.c +++ b/drivers/thermal/mediatek/lvts_thermal.c @@ -1458,7 +1458,6 @@ static const struct lvts_ctrl_data mt8188_lvts_mcu_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 1, 1), .offset = 0x0, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1469,7 +1468,6 @@ static const struct lvts_ctrl_data mt8188_lvts_mcu_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 0, 0), .offset = 0x100, - .mode = LVTS_MSR_FILTERED_MODE, } }; @@ -1483,7 +1481,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(0, 1, 0, 0), .offset = 0x0, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1496,7 +1493,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 1, 0), .offset = 0x100, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1507,7 +1503,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 0, 0), .offset = 0x200, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1518,7 +1513,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 0, 0), .offset = 0x300, - .mode = LVTS_MSR_FILTERED_MODE, } }; From 4eecb644b8b82f5279a348f6ebe77e3d6e5b1b05 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 4 Jun 2024 14:17:04 +0100 Subject: [PATCH 019/272] spi: cs42l43: Correct SPI root clock speed The root clock is actually 49.152MHz not 40MHz, as 
it is derived from the primary audio clock, update the driver to match. This error can cause the actual clock rate to be higher than the requested clock rate on the SPI bus. Fixes: ef75e767167a ("spi: cs42l43: Add SPI controller support") Signed-off-by: Charles Keepax Link: https://msgid.link/r/20240604131704.3227500-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/spi/spi-cs42l43.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c index 9d747ea699264..902a0734cc361 100644 --- a/drivers/spi/spi-cs42l43.c +++ b/drivers/spi/spi-cs42l43.c @@ -26,7 +26,7 @@ #include #define CS42L43_FIFO_SIZE 16 -#define CS42L43_SPI_ROOT_HZ (40 * HZ_PER_MHZ) +#define CS42L43_SPI_ROOT_HZ 49152000 #define CS42L43_SPI_MAX_LENGTH 65532 enum cs42l43_spi_cmd { From d21b3c60d6e3917f7388db6bcc455f01c99ee42b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 23 May 2024 16:41:02 +0100 Subject: [PATCH 020/272] KVM: selftests: Fix shift of 32 bit unsigned int more than 32 bits Currently a 32 bit 1u value is being shifted more than 32 bits causing overflow and incorrect checking of bits 32-63. Fix this by using the BIT_ULL macro for shifting bits.
Detected by cppcheck: sev_init2_tests.c:108:34: error: Shifting 32-bit value by 63 bits is undefined behaviour [shiftTooManyBits] Fixes: dfc083a181ba ("selftests: kvm: add tests for KVM_SEV_INIT2") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240523154102.2236133-1-colin.i.king@gmail.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/sev_init2_tests.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c index 7a4a61be119b1..3fb967f40c6a1 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c @@ -105,11 +105,11 @@ void test_features(uint32_t vm_type, uint64_t supported_features) int i; for (i = 0; i < 64; i++) { - if (!(supported_features & (1u << i))) + if (!(supported_features & BIT_ULL(i))) test_init2_invalid(vm_type, &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }, "unknown feature"); - else if (KNOWN_FEATURES & (1u << i)) + else if (KNOWN_FEATURES & BIT_ULL(i)) test_init2(vm_type, &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }); } From 980b8bc01938c8bcc9742c1051f64b5f0ed178ac Mon Sep 17 00:00:00 2001 From: Tao Su Date: Mon, 13 May 2024 09:40:03 +0800 Subject: [PATCH 021/272] KVM: selftests: x86: Prioritize getting max_gfn from GuestPhysBits Use the max mappable GPA via GuestPhysBits advertised by KVM to calculate max_gfn. Currently some selftests (e.g. access_tracking_perf_test, dirty_log_test...) add RAM regions close to max_gfn, so guest may access GPA beyond its mappable range and cause infinite loop. Adjust max_gfn in vm_compute_max_gfn() since x86 selftests already overrides vm_compute_max_gfn() specifically to deal with goofy edge cases. 
Reported-by: Yi Lai Signed-off-by: Tao Su Tested-by: Yi Lai Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240513014003.104593-1-tao1.su@linux.intel.com [sean: tweak name, add comment and sanity check] Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86_64/processor.h | 1 + .../testing/selftests/kvm/lib/x86_64/processor.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 8eb57de0b5876..c0c7c1fe93f98 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -277,6 +277,7 @@ struct kvm_x86_cpu_property { #define X86_PROPERTY_MAX_EXT_LEAF KVM_X86_CPU_PROPERTY(0x80000000, 0, EAX, 0, 31) #define X86_PROPERTY_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 0, 7) #define X86_PROPERTY_MAX_VIRT_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 8, 15) +#define X86_PROPERTY_GUEST_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 16, 23) #define X86_PROPERTY_SEV_C_BIT KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 0, 5) #define X86_PROPERTY_PHYS_ADDR_REDUCTION KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index c664e446136bc..594b061aef521 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1247,9 +1247,20 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) { const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ unsigned long ht_gfn, max_gfn, max_pfn; - uint8_t maxphyaddr; + uint8_t maxphyaddr, guest_maxphyaddr; - max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; + /* + * Use "guest MAXPHYADDR" from KVM if it's available. 
Guest MAXPHYADDR + * enumerates the max _mappable_ GPA, which can be less than the raw + * MAXPHYADDR, e.g. if MAXPHYADDR=52, KVM is using TDP, and the CPU + * doesn't support 5-level TDP. + */ + guest_maxphyaddr = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR); + guest_maxphyaddr = guest_maxphyaddr ?: vm->pa_bits; + TEST_ASSERT(guest_maxphyaddr <= vm->pa_bits, + "Guest MAXPHYADDR should never be greater than raw MAXPHYADDR"); + + max_gfn = (1ULL << (guest_maxphyaddr - vm->page_shift)) - 1; /* Avoid reserved HyperTransport region on AMD processors. */ if (!host_cpu_is_amd) From 49f683b41f28918df3e51ddc0d928cb2e934ccdb Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 10 May 2024 02:23:52 -0700 Subject: [PATCH 022/272] KVM: Fix a data race on last_boosted_vcpu in kvm_vcpu_on_spin() Use {READ,WRITE}_ONCE() to access kvm->last_boosted_vcpu to ensure the loads and stores are atomic. In the extremely unlikely scenario the compiler tears the stores, it's theoretically possible for KVM to attempt to get a vCPU using an out-of-bounds index, e.g. if the write is split into multiple 8-bit stores, and is paired with a 32-bit load on a VM with 257 vCPUs: CPU0 CPU1 last_boosted_vcpu = 0xff; (last_boosted_vcpu = 0x100) last_boosted_vcpu[15:8] = 0x01; i = (last_boosted_vcpu = 0x1ff) last_boosted_vcpu[7:0] = 0x00; vcpu = kvm->vcpu_array[0x1ff]; As detected by KCSAN: BUG: KCSAN: data-race in kvm_vcpu_on_spin [kvm] / kvm_vcpu_on_spin [kvm] write to 0xffffc90025a92344 of 4 bytes by task 4340 on cpu 16: kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4112) kvm handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:? arch/x86/kvm/vmx/vmx.c:6606) kvm_intel vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) 
kvm __se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890) __x64_sys_ioctl (fs/ioctl.c:890) x64_sys_call (arch/x86/entry/syscall_64.c:33) do_syscall_64 (arch/x86/entry/common.c:?) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) read to 0xffffc90025a92344 of 4 bytes by task 4342 on cpu 4: kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4069) kvm handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:? arch/x86/kvm/vmx/vmx.c:6606) kvm_intel vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm __se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890) __x64_sys_ioctl (fs/ioctl.c:890) x64_sys_call (arch/x86/entry/syscall_64.c:33) do_syscall_64 (arch/x86/entry/common.c:?) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) value changed: 0x00000012 -> 0x00000000 Fixes: 217ece6129f2 ("KVM: use yield_to instead of sleep in kvm_vcpu_on_spin") Cc: stable@vger.kernel.org Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240510092353.2261824-1-leitao@debian.org Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 14841acb8b959..843aa68cbcd05 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4025,12 +4025,13 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { struct kvm *kvm = me->kvm; struct kvm_vcpu *vcpu; - int last_boosted_vcpu = me->kvm->last_boosted_vcpu; + int last_boosted_vcpu; unsigned long i; int yielded = 0; int try = 3; int pass; + last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu); kvm_vcpu_set_in_spin_loop(me, true); /* * We boost the priority of a VCPU that is runnable but not @@ -4068,7 +4069,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) yielded = 
kvm_vcpu_yield_to(vcpu); if (yielded > 0) { - kvm->last_boosted_vcpu = i; + WRITE_ONCE(kvm->last_boosted_vcpu, i); break; } else if (yielded < 0) { try--; From 831bcbcead6668ebf20b64fdb27518f1362ace3a Mon Sep 17 00:00:00 2001 From: Aditya Nagesh Date: Fri, 31 May 2024 03:48:41 -0700 Subject: [PATCH 023/272] Drivers: hv: Cosmetic changes for hv.c and balloon.c Fix issues reported by checkpatch.pl script in hv.c and balloon.c - Remove unnecessary parentheses - Remove extra newlines - Remove extra spaces - Add spaces between comparison operators - Remove comparison with NULL in if statements No functional changes intended Signed-off-by: Aditya Nagesh Reviewed-by: Saurabh Sengar Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1717152521-6439-1-git-send-email-adityanagesh@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1717152521-6439-1-git-send-email-adityanagesh@linux.microsoft.com> --- drivers/hv/hv.c | 37 ++++++++-------- drivers/hv/hv_balloon.c | 98 +++++++++++++++-------------------------- 2 files changed, 53 insertions(+), 82 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index a8ad728354cb0..e0d676c74f147 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -45,8 +45,8 @@ int hv_init(void) * This involves a hypercall. 
*/ int hv_post_message(union hv_connection_id connection_id, - enum hv_message_type message_type, - void *payload, size_t payload_size) + enum hv_message_type message_type, + void *payload, size_t payload_size) { struct hv_input_post_message *aligned_msg; unsigned long flags; @@ -86,7 +86,7 @@ int hv_post_message(union hv_connection_id connection_id, status = HV_STATUS_INVALID_PARAMETER; } else { status = hv_do_hypercall(HVCALL_POST_MESSAGE, - aligned_msg, NULL); + aligned_msg, NULL); } local_irq_restore(flags); @@ -111,7 +111,7 @@ int hv_synic_alloc(void) hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask), GFP_KERNEL); - if (hv_context.hv_numa_map == NULL) { + if (!hv_context.hv_numa_map) { pr_err("Unable to allocate NUMA map\n"); goto err; } @@ -120,11 +120,11 @@ int hv_synic_alloc(void) hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); tasklet_init(&hv_cpu->msg_dpc, - vmbus_on_msg_dpc, (unsigned long) hv_cpu); + vmbus_on_msg_dpc, (unsigned long)hv_cpu); if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->post_msg_page == NULL) { + if (!hv_cpu->post_msg_page) { pr_err("Unable to allocate post msg page\n"); goto err; } @@ -147,14 +147,14 @@ int hv_synic_alloc(void) if (!ms_hyperv.paravisor_present && !hv_root_partition) { hv_cpu->synic_message_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->synic_message_page == NULL) { + if (!hv_cpu->synic_message_page) { pr_err("Unable to allocate SYNIC message page\n"); goto err; } hv_cpu->synic_event_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->synic_event_page == NULL) { + if (!hv_cpu->synic_event_page) { pr_err("Unable to allocate SYNIC event page\n"); free_page((unsigned long)hv_cpu->synic_message_page); @@ -203,14 +203,13 @@ int hv_synic_alloc(void) return ret; } - void hv_synic_free(void) { int cpu, ret; for_each_present_cpu(cpu) { - struct hv_per_cpu_context *hv_cpu - = 
per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); /* It's better to leak the page if the encryption fails. */ if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { @@ -262,8 +261,8 @@ void hv_synic_free(void) */ void hv_synic_enable_regs(unsigned int cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_sint shared_sint; @@ -277,8 +276,8 @@ void hv_synic_enable_regs(unsigned int cpu) /* Mask out vTOM bit. ioremap_cache() maps decrypted */ u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_message_page - = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + hv_cpu->synic_message_page = + (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); if (!hv_cpu->synic_message_page) pr_err("Fail to map synic message page.\n"); } else { @@ -296,8 +295,8 @@ void hv_synic_enable_regs(unsigned int cpu) /* Mask out vTOM bit. 
ioremap_cache() maps decrypted */ u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_event_page - = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + hv_cpu->synic_event_page = + (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); if (!hv_cpu->synic_event_page) pr_err("Fail to map synic event page.\n"); } else { @@ -348,8 +347,8 @@ int hv_synic_init(unsigned int cpu) */ void hv_synic_disable_regs(unsigned int cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_sint shared_sint; union hv_synic_simp simp; union hv_synic_siefp siefp; diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 4370ad31b5b38..0e7427c2baf58 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -42,8 +42,6 @@ * Begin protocol definitions. */ - - /* * Protocol versions. The low word is the minor version, the high word the major * version. @@ -72,8 +70,6 @@ enum { DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10 }; - - /* * Message Types */ @@ -102,7 +98,6 @@ enum dm_message_type { DM_VERSION_1_MAX = 12 }; - /* * Structures defining the dynamic memory management * protocol. @@ -116,7 +111,6 @@ union dm_version { __u32 version; } __packed; - union dm_caps { struct { __u64 balloon:1; @@ -149,8 +143,6 @@ union dm_mem_page_range { __u64 page_range; } __packed; - - /* * The header for all dynamic memory messages: * @@ -175,7 +167,6 @@ struct dm_message { __u8 data[]; /* enclosed message */ } __packed; - /* * Specific message types supporting the dynamic memory protocol. */ @@ -272,7 +263,6 @@ struct dm_status { __u32 io_diff; } __packed; - /* * Message to ask the guest to allocate memory - balloon up message. * This message is sent from the host to the guest. 
The guest may not be @@ -287,14 +277,13 @@ struct dm_balloon { __u32 reservedz; } __packed; - /* * Balloon response message; this message is sent from the guest * to the host in response to the balloon message. * * reservedz: Reserved; must be set to zero. * more_pages: If FALSE, this is the last message of the transaction. - * if TRUE there will atleast one more message from the guest. + * if TRUE there will be at least one more message from the guest. * * range_count: The number of ranges in the range array. * @@ -315,7 +304,7 @@ struct dm_balloon_response { * to the guest to give guest more memory. * * more_pages: If FALSE, this is the last message of the transaction. - * if TRUE there will atleast one more message from the guest. + * if TRUE there will be at least one more message from the guest. * * reservedz: Reserved; must be set to zero. * @@ -343,7 +332,6 @@ struct dm_unballoon_response { struct dm_header hdr; } __packed; - /* * Hot add request message. Message sent from the host to the guest. * @@ -391,7 +379,6 @@ enum dm_info_type { MAX_INFO_TYPE }; - /* * Header for the information message. */ @@ -481,10 +468,10 @@ static unsigned long last_post_time; static int hv_hypercall_multi_failure; -module_param(hot_add, bool, (S_IRUGO | S_IWUSR)); +module_param(hot_add, bool, 0644); MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add"); -module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR)); +module_param(pressure_report_delay, uint, 0644); MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure"); static atomic_t trans_id = ATOMIC_INIT(0); @@ -503,7 +490,6 @@ enum hv_dm_state { DM_INIT_ERROR }; - static __u8 recv_buffer[HV_HYP_PAGE_SIZE]; static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE]; @@ -599,12 +585,12 @@ static inline bool has_pfn_is_backed(struct hv_hotadd_state *has, struct hv_hotadd_gap *gap; /* The page is not backed. 
*/ - if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn)) + if (pfn < has->covered_start_pfn || pfn >= has->covered_end_pfn) return false; /* Check for gaps. */ list_for_each_entry(gap, &has->gap_list, list) { - if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn)) + if (pfn >= gap->start_pfn && pfn < gap->end_pfn) return false; } @@ -784,8 +770,8 @@ static void hv_online_page(struct page *pg, unsigned int order) guard(spinlock_irqsave)(&dm_device.ha_lock); list_for_each_entry(has, &dm_device.ha_region_list, list) { /* The page belongs to a different HAS. */ - if ((pfn < has->start_pfn) || - (pfn + (1UL << order) > has->end_pfn)) + if (pfn < has->start_pfn || + (pfn + (1UL << order) > has->end_pfn)) continue; hv_bring_pgs_online(has, pfn, 1UL << order); @@ -846,7 +832,7 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) } static unsigned long handle_pg_range(unsigned long pg_start, - unsigned long pg_count) + unsigned long pg_count) { unsigned long start_pfn = pg_start; unsigned long pfn_cnt = pg_count; @@ -857,7 +843,7 @@ static unsigned long handle_pg_range(unsigned long pg_start, unsigned long res = 0, flags; pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count, - pg_start); + pg_start); spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry(has, &dm_device.ha_region_list, list) { @@ -893,10 +879,9 @@ static unsigned long handle_pg_range(unsigned long pg_start, if (start_pfn > has->start_pfn && online_section_nr(pfn_to_section_nr(start_pfn))) hv_bring_pgs_online(has, start_pfn, pgs_ol); - } - if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { + if (has->ha_end_pfn < has->end_pfn && pfn_cnt > 0) { /* * We have some residual hot add range * that needs to be hot added; hot add @@ -999,7 +984,7 @@ static void hot_add_req(struct work_struct *dummy) rg_start = dm->ha_wrk.ha_region_range.finfo.start_page; rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; - if ((rg_start == 0) && 
(!dm->host_specified_ha_region)) { + if (rg_start == 0 && !dm->host_specified_ha_region) { /* * The host has not specified the hot-add region. * Based on the hot-add page range being specified, @@ -1013,7 +998,7 @@ static void hot_add_req(struct work_struct *dummy) if (do_hot_add) resp.page_count = process_hot_add(pg_start, pfn_cnt, - rg_start, rg_sz); + rg_start, rg_sz); dm->num_pages_added += resp.page_count; #endif @@ -1191,11 +1176,10 @@ static void post_status(struct hv_dynmem_device *dm) sizeof(struct dm_status), (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); - } static void free_balloon_pages(struct hv_dynmem_device *dm, - union dm_mem_page_range *range_array) + union dm_mem_page_range *range_array) { int num_pages = range_array->finfo.page_cnt; __u64 start_frame = range_array->finfo.start_page; @@ -1211,8 +1195,6 @@ static void free_balloon_pages(struct hv_dynmem_device *dm, } } - - static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, unsigned int num_pages, struct dm_balloon_response *bl_resp, @@ -1258,7 +1240,6 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, page_to_pfn(pg); bl_resp->range_array[i].finfo.page_cnt = alloc_unit; bl_resp->hdr.size += sizeof(union dm_mem_page_range); - } return i * alloc_unit; @@ -1312,7 +1293,7 @@ static void balloon_up(struct work_struct *dummy) if (num_ballooned == 0 || num_ballooned == num_pages) { pr_debug("Ballooned %u out of %u requested pages.\n", - num_pages, dm_device.balloon_wrk.num_pages); + num_pages, dm_device.balloon_wrk.num_pages); bl_resp->more_pages = 0; done = true; @@ -1346,16 +1327,15 @@ static void balloon_up(struct work_struct *dummy) for (i = 0; i < bl_resp->range_count; i++) free_balloon_pages(&dm_device, - &bl_resp->range_array[i]); + &bl_resp->range_array[i]); done = true; } } - } static void balloon_down(struct hv_dynmem_device *dm, - struct dm_unballoon_request *req) + struct dm_unballoon_request *req) { union dm_mem_page_range *range_array = req->range_array; 
int range_count = req->range_count; @@ -1369,7 +1349,7 @@ static void balloon_down(struct hv_dynmem_device *dm, } pr_debug("Freed %u ballooned pages.\n", - prev_pages_ballooned - dm->num_pages_ballooned); + prev_pages_ballooned - dm->num_pages_ballooned); if (req->more_pages == 1) return; @@ -1394,8 +1374,7 @@ static int dm_thread_func(void *dm_dev) struct hv_dynmem_device *dm = dm_dev; while (!kthread_should_stop()) { - wait_for_completion_interruptible_timeout( - &dm_device.config_event, 1*HZ); + wait_for_completion_interruptible_timeout(&dm_device.config_event, 1 * HZ); /* * The host expects us to post information on the memory * pressure every second. @@ -1419,9 +1398,8 @@ static int dm_thread_func(void *dm_dev) return 0; } - static void version_resp(struct hv_dynmem_device *dm, - struct dm_version_response *vresp) + struct dm_version_response *vresp) { struct dm_version_request version_req; int ret; @@ -1482,7 +1460,7 @@ static void version_resp(struct hv_dynmem_device *dm, } static void cap_resp(struct hv_dynmem_device *dm, - struct dm_capabilities_resp_msg *cap_resp) + struct dm_capabilities_resp_msg *cap_resp) { if (!cap_resp->is_accepted) { pr_err("Capabilities not accepted by host\n"); @@ -1515,7 +1493,7 @@ static void balloon_onchannelcallback(void *context) switch (dm_hdr->type) { case DM_VERSION_RESPONSE: version_resp(dm, - (struct dm_version_response *)dm_msg); + (struct dm_version_response *)dm_msg); break; case DM_CAPABILITIES_RESPONSE: @@ -1545,7 +1523,7 @@ static void balloon_onchannelcallback(void *context) dm->state = DM_BALLOON_DOWN; balloon_down(dm, - (struct dm_unballoon_request *)recv_buffer); + (struct dm_unballoon_request *)recv_buffer); break; case DM_MEM_HOT_ADD_REQUEST: @@ -1583,17 +1561,15 @@ static void balloon_onchannelcallback(void *context) default: pr_warn_ratelimited("Unhandled message: type: %d\n", dm_hdr->type); - } } - } #define HV_LARGE_REPORTING_ORDER 9 #define HV_LARGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << \ 
HV_LARGE_REPORTING_ORDER) static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, - struct scatterlist *sgl, unsigned int nents) + struct scatterlist *sgl, unsigned int nents) { unsigned long flags; struct hv_memory_hint *hint; @@ -1628,7 +1604,7 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, */ /* page reporting for pages 2MB or higher */ - if (order >= HV_LARGE_REPORTING_ORDER ) { + if (order >= HV_LARGE_REPORTING_ORDER) { range->page.largepage = 1; range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB; range->base_large_pfn = page_to_hvpfn( @@ -1642,23 +1618,21 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, range->page.additional_pages = (sg->length / HV_HYP_PAGE_SIZE) - 1; } - } status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0, hint, NULL); local_irq_restore(flags); if (!hv_result_success(status)) { - pr_err("Cold memory discard hypercall failed with status %llx\n", - status); + status); if (hv_hypercall_multi_failure > 0) hv_hypercall_multi_failure++; if (hv_result(status) == HV_STATUS_INVALID_PARAMETER) { pr_err("Underlying Hyper-V does not support order less than 9. 
Hypercall failed\n"); pr_err("Defaulting to page_reporting_order %d\n", - pageblock_order); + pageblock_order); page_reporting_order = pageblock_order; hv_hypercall_multi_failure++; return -EINVAL; @@ -1692,7 +1666,7 @@ static void enable_page_reporting(void) pr_err("Failed to enable cold memory discard: %d\n", ret); } else { pr_info("Cold memory discard hint enabled with order %d\n", - page_reporting_order); + page_reporting_order); } } @@ -1775,7 +1749,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); + t = wait_for_completion_timeout(&dm_device.host_event, 5 * HZ); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1833,7 +1807,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); + t = wait_for_completion_timeout(&dm_device.host_event, 5 * HZ); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1874,8 +1848,8 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset) char *sname; seq_printf(f, "%-22s: %u.%u\n", "host_version", - DYNMEM_MAJOR_VERSION(dm->version), - DYNMEM_MINOR_VERSION(dm->version)); + DYNMEM_MAJOR_VERSION(dm->version), + DYNMEM_MINOR_VERSION(dm->version)); seq_printf(f, "%-22s:", "capabilities"); if (ballooning_enabled()) @@ -1924,10 +1898,10 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset) seq_printf(f, "%-22s: %u\n", "pages_ballooned", dm->num_pages_ballooned); seq_printf(f, "%-22s: %lu\n", "total_pages_committed", - get_pages_committed(dm)); + get_pages_committed(dm)); seq_printf(f, "%-22s: %llu\n", "max_dynamic_page_count", - dm->max_dynamic_page_count); + dm->max_dynamic_page_count); return 0; } @@ -1937,7 +1911,7 @@ DEFINE_SHOW_ATTRIBUTE(hv_balloon_debug); static void hv_balloon_debugfs_init(struct hv_dynmem_device *b) { debugfs_create_file("hv-balloon", 0444, NULL, b, - &hv_balloon_debug_fops); + &hv_balloon_debug_fops); } static void 
hv_balloon_debugfs_exit(struct hv_dynmem_device *b) @@ -2095,7 +2069,6 @@ static int balloon_suspend(struct hv_device *hv_dev) tasklet_enable(&hv_dev->channel->callback_event); return 0; - } static int balloon_resume(struct hv_device *dev) @@ -2154,7 +2127,6 @@ static struct hv_driver balloon_drv = { static int __init init_balloon_drv(void) { - return vmbus_driver_register(&balloon_drv); } From 0d92e4a7ffd5c42b9fa864692f82476c0bf8bcc8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 5 Jun 2024 18:56:37 +0100 Subject: [PATCH 024/272] KVM: arm64: Disassociate vcpus from redistributor region on teardown When tearing down a redistributor region, make sure we don't have any dangling pointer to that region stored in a vcpu. Fixes: e5a35635464b ("kvm: arm64: vgic-v3: Introduce vgic_v3_free_redist_region()") Reported-by: Alexander Potapenko Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240605175637.1635653-1-maz@kernel.org Cc: stable@vger.kernel.org --- arch/arm64/kvm/vgic/vgic-init.c | 2 +- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 15 +++++++++++++-- arch/arm64/kvm/vgic/vgic.h | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 8f5b7a3e7009e..7f68cf58b978f 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -391,7 +391,7 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm) if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) - vgic_v3_free_redist_region(rdreg); + vgic_v3_free_redist_region(kvm, rdreg); INIT_LIST_HEAD(&dist->rd_regions); } else { dist->vgic_cpu_base = VGIC_ADDR_UNDEF; diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index a3983a631b5ad..9e50928f5d7df 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -919,8 +919,19 @@ static int 
vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, return ret; } -void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg) +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg) { + struct kvm_vcpu *vcpu; + unsigned long c; + + lockdep_assert_held(&kvm->arch.config_lock); + + /* Garbage collect the region */ + kvm_for_each_vcpu(c, vcpu, kvm) { + if (vcpu->arch.vgic_cpu.rdreg == rdreg) + vcpu->arch.vgic_cpu.rdreg = NULL; + } + list_del(&rdreg->list); kfree(rdreg); } @@ -945,7 +956,7 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) mutex_lock(&kvm->arch.config_lock); rdreg = vgic_v3_rdist_region_from_index(kvm, index); - vgic_v3_free_redist_region(rdreg); + vgic_v3_free_redist_region(kvm, rdreg); mutex_unlock(&kvm->arch.config_lock); return ret; } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 6106ebd5ba429..03d356a123771 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -316,7 +316,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, u32 index); -void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg); +void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg); bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); From 0fc670d07d5de36a54f061f457743c9cde1d8b46 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 3 Jun 2024 14:20:46 +0200 Subject: [PATCH 025/272] KVM: selftests: Fix RISC-V compilation Due to commit 2b7deea3ec7c ("Revert "kvm: selftests: move base kvm_util.h declarations to kvm_util_base.h"") kvm selftests now requires explicitly including ucall_common.h when needed. The commit added the directives everywhere they were needed at the time, but, by merge time, new places had been merged for RISC-V. Add those now to fix RISC-V's compilation. 
Fixes: dee7ea42a1eb ("Merge tag 'kvm-x86-selftests_utils-6.10' of https://github.com/kvm-x86/linux into HEAD") Signed-off-by: Andrew Jones Link: https://lore.kernel.org/r/20240603122045.323064-2-ajones@ventanamicro.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/lib/riscv/ucall.c | 1 + tools/testing/selftests/kvm/riscv/ebreak_test.c | 1 + tools/testing/selftests/kvm/riscv/sbi_pmu_test.c | 1 + 3 files changed, 3 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c index 14ee17151a590..b5035c63d5163 100644 --- a/tools/testing/selftests/kvm/lib/riscv/ucall.c +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -9,6 +9,7 @@ #include "kvm_util.h" #include "processor.h" +#include "sbi.h" void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) { diff --git a/tools/testing/selftests/kvm/riscv/ebreak_test.c b/tools/testing/selftests/kvm/riscv/ebreak_test.c index 823c132069b46..0e07128549538 100644 --- a/tools/testing/selftests/kvm/riscv/ebreak_test.c +++ b/tools/testing/selftests/kvm/riscv/ebreak_test.c @@ -6,6 +6,7 @@ * */ #include "kvm_util.h" +#include "ucall_common.h" #define LABEL_ADDRESS(v) ((uint64_t)&(v)) diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c index 69bb94e6b2276..f299cbfd23ca0 100644 --- a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -15,6 +15,7 @@ #include "processor.h" #include "sbi.h" #include "arch_timer.h" +#include "ucall_common.h" /* Maximum counters(firmware + hardware) */ #define RISCV_MAX_PMU_COUNTERS 64 From 0ee14725471cea66e03e3cd4f4c582d759de502c Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 6 Jun 2024 15:46:09 +0100 Subject: [PATCH 026/272] mm/util: Swap kmemdup_array() arguments GCC 14.1 complains about the argument usage of kmemdup_array(): drivers/soc/tegra/fuse/fuse-tegra.c:130:65: error: 'kmemdup_array' sizes specified 
with 'sizeof' in the earlier argument and not in the later argument [-Werror=calloc-transposed-args] 130 | fuse->lookups = kmemdup_array(fuse->soc->lookups, sizeof(*fuse->lookups), | ^ drivers/soc/tegra/fuse/fuse-tegra.c:130:65: note: earlier argument should specify number of elements, later size of each element The annotation introduced by commit 7d78a7773355 ("string: Add additional __realloc_size() annotations for "dup" helpers") lets the compiler think that kmemdup_array() follows the same format as calloc(), with the number of elements preceding the size of one element. So we could simply swap the arguments to __realloc_size() to get rid of that warning, but it seems cleaner to instead have kmemdup_array() follow the same format as krealloc_array(), memdup_array_user(), calloc() etc. Fixes: 7d78a7773355 ("string: Add additional __realloc_size() annotations for "dup" helpers") Signed-off-by: Jean-Philippe Brucker Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240606144608.97817-2-jean-philippe@linaro.org Signed-off-by: Kees Cook --- drivers/soc/tegra/fuse/fuse-tegra.c | 4 ++-- include/linux/string.h | 2 +- lib/fortify_kunit.c | 2 +- mm/util.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index b6bfd6729df39..d276672838465 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -127,8 +127,8 @@ static void tegra_fuse_print_sku_info(struct tegra_sku_info *tegra_sku_info) static int tegra_fuse_add_lookups(struct tegra_fuse *fuse) { - fuse->lookups = kmemdup_array(fuse->soc->lookups, sizeof(*fuse->lookups), - fuse->soc->num_lookups, GFP_KERNEL); + fuse->lookups = kmemdup_array(fuse->soc->lookups, fuse->soc->num_lookups, + sizeof(*fuse->lookups), GFP_KERNEL); if (!fuse->lookups) return -ENOMEM; diff --git a/include/linux/string.h b/include/linux/string.h index 60168aa2af075..9edace076ddbf 100644 --- 
a/include/linux/string.h +++ b/include/linux/string.h @@ -289,7 +289,7 @@ extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_si extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); -extern void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) +extern void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) __realloc_size(2, 3); /* lib/argv_split.c */ diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c index f9cc467334ce3..e17d520f532cf 100644 --- a/lib/fortify_kunit.c +++ b/lib/fortify_kunit.c @@ -374,7 +374,7 @@ static const char * const test_strs[] = { for (i = 0; i < ARRAY_SIZE(test_strs); i++) { \ len = strlen(test_strs[i]); \ KUNIT_EXPECT_EQ(test, __builtin_constant_p(len), 0); \ - checker(len, kmemdup_array(test_strs[i], len, 1, gfp), \ + checker(len, kmemdup_array(test_strs[i], 1, len, gfp), \ kfree(p)); \ checker(len, kmemdup(test_strs[i], len, gfp), \ kfree(p)); \ diff --git a/mm/util.c b/mm/util.c index c9e519e6811f5..6682097372efc 100644 --- a/mm/util.c +++ b/mm/util.c @@ -139,14 +139,14 @@ EXPORT_SYMBOL(kmemdup_noprof); * kmemdup_array - duplicate a given array. * * @src: array to duplicate. - * @element_size: size of each element of array. * @count: number of elements to duplicate from array. + * @element_size: size of each element of array. * @gfp: GFP mask to use. * * Return: duplicated array of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. 
*/ -void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) +void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) { return kmemdup(src, size_mul(element_size, count), gfp); } From f7d3b1ffc654b0435ac2c9c02a72fb2752bdb0fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Fri, 15 Mar 2024 13:54:10 +0100 Subject: [PATCH 027/272] yama: document function parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the unused function parameter of yama_relation_cleanup() to please kernel doc warnings. Signed-off-by: Christian Göttsche Reviewed-by: Paul Moore Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240315125418.273104-2-cgzones@googlemail.com Signed-off-by: Kees Cook --- security/yama/yama_lsm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index b6684a074a59b..39944a859ff6b 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -111,6 +111,7 @@ static void report_access(const char *access, struct task_struct *target, /** * yama_relation_cleanup - remove invalid entries from the relation list + * @work: unused * */ static void yama_relation_cleanup(struct work_struct *work) From 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 7 Jun 2024 11:34:23 +0100 Subject: [PATCH 028/272] spi: cs42l43: Drop cs35l56 SPI speed down to 11MHz Some internals of the cs35l56 can only support SPI speeds of up to 11MHz. Whilst some use-cases could support higher rates, keep things simple by dropping the SPI speed down to this to avoid any potential issues. 
Signed-off-by: Charles Keepax Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240607103423.4159834-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/spi/spi-cs42l43.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c index 902a0734cc361..8b618ef0f7111 100644 --- a/drivers/spi/spi-cs42l43.c +++ b/drivers/spi/spi-cs42l43.c @@ -54,7 +54,7 @@ static const struct software_node ampr = { static struct spi_board_info ampl_info = { .modalias = "cs35l56", - .max_speed_hz = 20 * HZ_PER_MHZ, + .max_speed_hz = 11 * HZ_PER_MHZ, .chip_select = 0, .mode = SPI_MODE_0, .swnode = &l, @@ -62,7 +62,7 @@ static struct spi_board_info ampl_info = { static struct spi_board_info ampr_info = { .modalias = "cs35l56", - .max_speed_hz = 20 * HZ_PER_MHZ, + .max_speed_hz = 11 * HZ_PER_MHZ, .chip_select = 1, .mode = SPI_MODE_0, .swnode = &r, From 462237d2d93fc9e9221d1cf9f773954d27da83c0 Mon Sep 17 00:00:00 2001 From: Louis Chauvet Date: Fri, 7 Jun 2024 10:34:38 +0200 Subject: [PATCH 029/272] dmaengine: xilinx: xdma: Fix data synchronisation in xdma_channel_isr() Requests the vchan lock before using xdma->stop_request. 
Fixes: 6a40fb824596 ("dmaengine: xilinx: xdma: Fix synchronization issue") Cc: stable@vger.kernel.org Signed-off-by: Louis Chauvet Link: https://lore.kernel.org/r/20240607-xdma-fixes-v2-1-0282319ce345@bootlin.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index e143a73308161..718842fdaf98e 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -885,11 +885,11 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) u32 st; bool repeat_tx; + spin_lock(&xchan->vchan.lock); + if (xchan->stop_requested) complete(&xchan->last_interrupt); - spin_lock(&xchan->vchan.lock); - /* get submitted request */ vd = vchan_next_desc(&xchan->vchan); if (!vd) From f67ac0061c7614c1548963d3ef1ee1606efd8636 Mon Sep 17 00:00:00 2001 From: Honggang LI Date: Thu, 23 May 2024 17:46:17 +0800 Subject: [PATCH 030/272] RDMA/rxe: Fix responder length checking for UD request packets According to the IBA specification: If a UD request packet is detected with an invalid length, the request shall be an invalid request and it shall be silently dropped by the responder. The responder then waits for a new request packet. commit 689c5421bfe0 ("RDMA/rxe: Fix incorrect responder length checking") defers responder length check for UD QPs in function `copy_data`. But it introduces a regression issue for UD QPs. When the packet size is too large to fit in the receive buffer. `copy_data` will return error code -EINVAL. Then `send_data_in` will return RESPST_ERR_MALFORMED_WQE. UD QP will transfer into ERROR state. 
Fixes: 689c5421bfe0 ("RDMA/rxe: Fix incorrect responder length checking") Signed-off-by: Honggang LI Link: https://lore.kernel.org/r/20240523094617.141148-1-honggangli@163.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_resp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index c6a7fa3054fad..6596a85723c9a 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -344,6 +344,19 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, * receive buffer later. For rmda operations additional * length checks are performed in check_rkey. */ + if ((qp_type(qp) == IB_QPT_GSI) || (qp_type(qp) == IB_QPT_UD)) { + unsigned int payload = payload_size(pkt); + unsigned int recv_buffer_len = 0; + int i; + + for (i = 0; i < qp->resp.wqe->dma.num_sge; i++) + recv_buffer_len += qp->resp.wqe->dma.sge[i].length; + if (payload + 40 > recv_buffer_len) { + rxe_dbg_qp(qp, "The receive buffer is too small for this UD packet.\n"); + return RESPST_ERR_LENGTH; + } + } + if (pkt->mask & RXE_PAYLOAD_MASK && ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC))) { unsigned int mtu = qp->mtu; From 9dd5134c61580ba4c219296c37e08ff64c109a74 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 10 Jun 2024 11:23:05 -0700 Subject: [PATCH 031/272] kunit/overflow: Adjust for __counted_by with DEFINE_RAW_FLEX() When a flexible array structure has a __counted_by annotation, its use with DEFINE_RAW_FLEX() will result in the count being zero-initialized. This is expected since one doesn't want to use RAW with a counted_by struct. Adjust the tests to check for the condition and for compiler support. 
Reported-by: Christian Schrefl Closes: https://lore.kernel.org/all/0bfc6b38-8bc5-4971-b6fb-dc642a73fbfe@gmail.com/ Suggested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240610182301.work.272-kees@kernel.org Tested-by: Christian Schrefl Reviewed-by: Christian Schrefl Signed-off-by: Kees Cook --- lib/overflow_kunit.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c index 4ef31b0bb74d6..d305b0c054bb7 100644 --- a/lib/overflow_kunit.c +++ b/lib/overflow_kunit.c @@ -1178,14 +1178,28 @@ struct foo { s16 array[] __counted_by(counter); }; +struct bar { + int a; + u32 counter; + s16 array[]; +}; + static void DEFINE_FLEX_test(struct kunit *test) { - DEFINE_RAW_FLEX(struct foo, two, array, 2); + /* Using _RAW_ on a __counted_by struct will initialize "counter" to zero */ + DEFINE_RAW_FLEX(struct foo, two_but_zero, array, 2); +#if __has_attribute(__counted_by__) + int expected_raw_size = sizeof(struct foo); +#else + int expected_raw_size = sizeof(struct foo) + 2 * sizeof(s16); +#endif + /* Without annotation, it will always be on-stack size. */ + DEFINE_RAW_FLEX(struct bar, two, array, 2); DEFINE_FLEX(struct foo, eight, array, counter, 8); DEFINE_FLEX(struct foo, empty, array, counter, 0); - KUNIT_EXPECT_EQ(test, __struct_size(two), - sizeof(struct foo) + sizeof(s16) + sizeof(s16)); + KUNIT_EXPECT_EQ(test, __struct_size(two_but_zero), expected_raw_size); + KUNIT_EXPECT_EQ(test, __struct_size(two), sizeof(struct bar) + 2 * sizeof(s16)); KUNIT_EXPECT_EQ(test, __struct_size(eight), 24); KUNIT_EXPECT_EQ(test, __struct_size(empty), sizeof(struct foo)); } From 3f60497c658d2072714d097a177612d34b34aa3d Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 10 Jun 2024 20:55:32 +0100 Subject: [PATCH 032/272] regulator: core: Fix modpost error "regulator_get_regmap" undefined Fix the modpost error "regulator_get_regmap" undefined by adding export symbol. 
Fixes: 04eca28cde52 ("regulator: Add helpers for low-level register access") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202406110117.mk5UR3VZ-lkp@intel.com Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20240610195532.175942-1-biju.das.jz@bp.renesas.com Signed-off-by: Mark Brown --- drivers/regulator/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 5794f4e9dd529..844e9587a880f 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -3347,6 +3347,7 @@ struct regmap *regulator_get_regmap(struct regulator *regulator) return map ? map : ERR_PTR(-EOPNOTSUPP); } +EXPORT_SYMBOL_GPL(regulator_get_regmap); /** * regulator_get_hardware_vsel_register - get the HW voltage selector register From ae9daffd9028f2500c9ac1517e46d4f2b57efb80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 8 May 2024 15:07:00 +0300 Subject: [PATCH 033/272] MIPS: Routerboard 532: Fix vendor retry check code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit read_config_dword() contains strange condition checking ret for a number of values. The ret variable, however, is always zero because config_access() never returns anything else. Thus, the retry is always taken until number of tries is exceeded. The code looks like it wants to check *val instead of ret to see if the read gave an error response. 
Fixes: 73b4390fb234 ("[MIPS] Routerboard 532: Support for base system") Signed-off-by: Ilpo Järvinen Signed-off-by: Thomas Bogendoerfer --- arch/mips/pci/ops-rc32434.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/pci/ops-rc32434.c b/arch/mips/pci/ops-rc32434.c index 874ed6df97683..34b9323bdabb0 100644 --- a/arch/mips/pci/ops-rc32434.c +++ b/arch/mips/pci/ops-rc32434.c @@ -112,8 +112,8 @@ static int read_config_dword(struct pci_bus *bus, unsigned int devfn, * gives them time to settle */ if (where == PCI_VENDOR_ID) { - if (ret == 0xffffffff || ret == 0x00000000 || - ret == 0x0000ffff || ret == 0xffff0000) { + if (*val == 0xffffffff || *val == 0x00000000 || + *val == 0x0000ffff || *val == 0xffff0000) { if (delay > 4) return 0; delay *= 2; From 277a0363120276645ae598d8d5fea7265e076ae9 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Fri, 7 Jun 2024 11:04:00 +0200 Subject: [PATCH 034/272] MIPS: pci: lantiq: restore reset gpio polarity Commit 90c2d2eb7ab5 ("MIPS: pci: lantiq: switch to using gpiod API") not only switched to the gpiod API, but also inverted / changed the polarity of the GPIO. According to the PCI specification, the RST# pin is an active-low signal. However, most of the device trees that have been widely used for a long time (mainly in the openWrt project) define this GPIO as active-high and the old driver code inverted the signal internally. Apparently there are actually boards where the reset gpio must be operated inverted. For this reason, we cannot use the GPIOD_OUT_LOW/HIGH flag for initialization. Instead, we must explicitly set the gpio to value 1 in order to take into account any "GPIO_ACTIVE_LOW" flag that may have been set. In order to remain compatible with all these existing device trees, we should therefore keep the logic as it was before the commit. 
Fixes: 90c2d2eb7ab5 ("MIPS: pci: lantiq: switch to using gpiod API") Cc: stable@vger.kernel.org Signed-off-by: Martin Schiller Signed-off-by: Thomas Bogendoerfer --- arch/mips/pci/pci-lantiq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c index 68a8cefed420b..0844db34022e4 100644 --- a/arch/mips/pci/pci-lantiq.c +++ b/arch/mips/pci/pci-lantiq.c @@ -124,14 +124,14 @@ static int ltq_pci_startup(struct platform_device *pdev) clk_disable(clk_external); /* setup reset gpio used by pci */ - reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset", - GPIOD_OUT_LOW); + reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_ASIS); error = PTR_ERR_OR_ZERO(reset_gpio); if (error) { dev_err(&pdev->dev, "failed to request gpio: %d\n", error); return error; } gpiod_set_consumer_name(reset_gpio, "pci_reset"); + gpiod_direction_output(reset_gpio, 1); /* enable auto-switching between PCI and EBU */ ltq_pci_w32(0xa, PCI_CR_CLK_CTRL); @@ -194,10 +194,10 @@ static int ltq_pci_startup(struct platform_device *pdev) /* toggle reset pin */ if (reset_gpio) { - gpiod_set_value_cansleep(reset_gpio, 1); + gpiod_set_value_cansleep(reset_gpio, 0); wmb(); mdelay(1); - gpiod_set_value_cansleep(reset_gpio, 0); + gpiod_set_value_cansleep(reset_gpio, 1); } return 0; } From ce5cdd3b05216b704a704f466fb4c2dff3778caf Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Tue, 11 Jun 2024 13:35:33 +0200 Subject: [PATCH 035/272] mips: bmips: BCM6358: make sure CBR is correctly set It was discovered that some devices have the CBR address set to 0, causing a kernel panic when arch_sync_dma_for_cpu_all is called. This was noticed in situations where the system is booted from TP1 and BMIPS_GET_CBR() returns 0 instead of a valid address and !!(read_c0_brcm_cmt_local() & (1 << 31)); not failing. The current checks for whether RAC flush should be disabled are not enough, hence let's check if CBR is a valid address or not. 
Fixes: ab327f8acdf8 ("mips: bmips: BCM6358: disable RAC flush for TP1") Signed-off-by: Christian Marangi Acked-by: Florian Fainelli Signed-off-by: Thomas Bogendoerfer --- arch/mips/bmips/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/bmips/setup.c b/arch/mips/bmips/setup.c index ec180ab92eaa8..66a8ba19c2872 100644 --- a/arch/mips/bmips/setup.c +++ b/arch/mips/bmips/setup.c @@ -110,7 +110,8 @@ static void bcm6358_quirks(void) * RAC flush causes kernel panics on BCM6358 when booting from TP1 * because the bootloader is not initializing it properly. */ - bmips_rac_flush_disable = !!(read_c0_brcm_cmt_local() & (1 << 31)); + bmips_rac_flush_disable = !!(read_c0_brcm_cmt_local() & (1 << 31)) || + !!BMIPS_GET_CBR(); } static void bcm6368_quirks(void) From 2049aad5d3a6921f80121029afe6fbcfb2727861 Mon Sep 17 00:00:00 2001 From: Amer Al Shanawany Date: Wed, 17 Apr 2024 20:49:13 +0200 Subject: [PATCH 036/272] selftests: filesystems: fix warn_unused_result build warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following warnings by adding return check and error messages. 
statmount_test.c: In function ‘cleanup_namespace’: statmount_test.c:128:9: warning: ignoring return value of ‘fchdir’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 128 | fchdir(orig_root); | ^~~~~~~~~~~~~~~~~ statmount_test.c:129:9: warning: ignoring return value of ‘chroot’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 129 | chroot("."); | ^~~~~~~~~~~ Signed-off-by: Amer Al Shanawany Reviewed-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- .../selftests/filesystems/statmount/statmount_test.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index e6d7c4f1c85b5..e8c019d72cbf3 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -125,8 +125,16 @@ static uint32_t old_root_id, old_parent_id; static void cleanup_namespace(void) { - fchdir(orig_root); - chroot("."); + int ret; + + ret = fchdir(orig_root); + if (ret == -1) + ksft_perror("fchdir to original root"); + + ret = chroot("."); + if (ret == -1) + ksft_perror("chroot to original root"); + umount2(root_mntpoint, MNT_DETACH); rmdir(root_mntpoint); } From 04e1f99afe8bec27ad2d2726897ac8185bf0532c Mon Sep 17 00:00:00 2001 From: Amer Al Shanawany Date: Tue, 11 Jun 2024 17:16:08 +0200 Subject: [PATCH 037/272] selftests: seccomp: fix format-zero-length warnings fix the following errors by using string format specifier and an empty parameter: seccomp_benchmark.c:197:24: warning: zero-length gnu_printf format string [-Wformat-zero-length] 197 | ksft_print_msg(""); | ^~ seccomp_benchmark.c:202:24: warning: zero-length gnu_printf format string [-Wformat-zero-length] 202 | ksft_print_msg(""); | ^~ seccomp_benchmark.c:204:24: warning: zero-length gnu_printf format string [-Wformat-zero-length] 204 | ksft_print_msg(""); | ^~ Reported-by: 
kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312260235.Uj5ug8K9-lkp@intel.com/ Suggested-by: Kees Cook Signed-off-by: Amer Al Shanawany Signed-off-by: Shuah Khan --- tools/testing/selftests/seccomp/seccomp_benchmark.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index b83099160fbca..94886c82ae609 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -194,14 +194,14 @@ int main(int argc, char *argv[]) ksft_set_plan(7); ksft_print_msg("Running on:\n"); - ksft_print_msg(""); + ksft_print_msg("%s", ""); system("uname -a"); ksft_print_msg("Current BPF sysctl settings:\n"); /* Avoid using "sysctl" which may not be installed. */ - ksft_print_msg(""); + ksft_print_msg("%s", ""); system("grep -H . /proc/sys/net/core/bpf_jit_enable"); - ksft_print_msg(""); + ksft_print_msg("%s", ""); system("grep -H . /proc/sys/net/core/bpf_jit_harden"); affinity(); From e3215deca4520773cd2b155bed164c12365149a7 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 3 Jun 2024 09:24:44 +0800 Subject: [PATCH 038/272] dmaengine: idxd: Fix possible Use-After-Free in irq_process_work_list Use list_for_each_entry_safe() to allow iterating through the list and deleting the entry in the iteration process. The descriptor is freed via idxd_desc_complete() and there's a slight chance this may cause issues for the list iterator when the descriptor is reused by another thread without it being deleted from the list. 
Fixes: 16e19e11228b ("dmaengine: idxd: Fix list corruption in description completion") Signed-off-by: Li RongQing Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20240603012444.11902-1-lirongqing@baidu.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/irq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 8dc029c865515..fc049c9c9892e 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -611,11 +611,13 @@ static void irq_process_work_list(struct idxd_irq_entry *irq_entry) spin_unlock(&irq_entry->list_lock); - list_for_each_entry(desc, &flist, list) { + list_for_each_entry_safe(desc, n, &flist, list) { /* * Check against the original status as ABORT is software defined * and 0xff, which DSA_COMP_STATUS_MASK can mask out. */ + list_del(&desc->list); + if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) { idxd_desc_complete(desc, IDXD_COMPLETE_ABORT, true); continue; From ba27e9d2207784da748b19170a2e56bd7770bd81 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Sun, 2 Jun 2024 07:03:19 +0530 Subject: [PATCH 039/272] dmaengine: ti: k3-udma-glue: Fix of_k3_udma_glue_parse_chn_by_id() The of_k3_udma_glue_parse_chn_by_id() helper function erroneously invokes "of_node_put()" on the "udmax_np" device-node passed to it, without having incremented its reference count at any point. Fix it. 
Fixes: 81a1f90f20af ("dmaengine: ti: k3-udma-glue: Add function to parse channel by ID") Signed-off-by: Siddharth Vadapalli Acked-by: Peter Ujfalusi Acked-by: Peter Ujfalusi@gmail.com Link: https://lore.kernel.org/r/20240602013319.2975894-1-s-vadapalli@ti.com Signed-off-by: Vinod Koul --- drivers/dma/ti/k3-udma-glue.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c index c9b93055dc9d3..f0a399cf45b2a 100644 --- a/drivers/dma/ti/k3-udma-glue.c +++ b/drivers/dma/ti/k3-udma-glue.c @@ -200,12 +200,9 @@ of_k3_udma_glue_parse_chn_by_id(struct device_node *udmax_np, struct k3_udma_glu ret = of_k3_udma_glue_parse(udmax_np, common); if (ret) - goto out_put_spec; + return ret; ret = of_k3_udma_glue_parse_chn_common(common, thread_id, tx_chn); - -out_put_spec: - of_node_put(udmax_np); return ret; } From 1b11b4ef6bd68591dcaf8423c7d05e794e6aec6f Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Tue, 28 May 2024 09:09:23 +0300 Subject: [PATCH 040/272] dmaengine: ioatdma: Fix leaking on version mismatch Fix leaking ioatdma_device if I/OAT version is less than IOAT_VER_3_0. 
Fixes: bf453a0a18b2 ("dmaengine: ioat: Support in-use unbind") Signed-off-by: Nikita Shubin Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20240528-ioatdma-fixes-v2-1-a9f2fbe26ab1@yadro.com Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index 9c364e92cb828..e76e507ae898c 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -1350,6 +1350,7 @@ static int ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) void __iomem * const *iomap; struct device *dev = &pdev->dev; struct ioatdma_device *device; + u8 version; int err; err = pcim_enable_device(pdev); @@ -1363,6 +1364,10 @@ static int ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!iomap) return -ENOMEM; + version = readb(iomap[IOAT_MMIO_BAR] + IOAT_VER_OFFSET); + if (version < IOAT_VER_3_0) + return -ENODEV; + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (err) return err; @@ -1373,16 +1378,14 @@ static int ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_master(pdev); pci_set_drvdata(pdev, device); - device->version = readb(device->reg_base + IOAT_VER_OFFSET); + device->version = version; if (device->version >= IOAT_VER_3_4) ioat_dca_enabled = 0; - if (device->version >= IOAT_VER_3_0) { - if (is_skx_ioat(pdev)) - device->version = IOAT_VER_3_2; - err = ioat3_dma_probe(device, ioat_dca_enabled); - } else - return -ENODEV; + if (is_skx_ioat(pdev)) + device->version = IOAT_VER_3_2; + + err = ioat3_dma_probe(device, ioat_dca_enabled); if (err) { dev_err(dev, "Intel(R) I/OAT DMA Engine init failed\n"); return -ENODEV; From f0dc9fda2e0ee9e01496c2f5aca3a831131fad79 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Tue, 28 May 2024 09:09:24 +0300 Subject: [PATCH 041/272] dmaengine: ioatdma: Fix error path in ioat3_dma_probe() Make sure we are disabling interrupts and 
destroying DMA pool if pcie_capability_read/write_word() call failed. Fixes: 511deae0261c ("dmaengine: ioatdma: disable relaxed ordering for ioatdma") Signed-off-by: Nikita Shubin Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20240528-ioatdma-fixes-v2-2-a9f2fbe26ab1@yadro.com Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index e76e507ae898c..26964b7c8cf14 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -534,18 +534,6 @@ static int ioat_probe(struct ioatdma_device *ioat_dma) return err; } -static int ioat_register(struct ioatdma_device *ioat_dma) -{ - int err = dma_async_device_register(&ioat_dma->dma_dev); - - if (err) { - ioat_disable_interrupts(ioat_dma); - dma_pool_destroy(ioat_dma->completion_pool); - } - - return err; -} - static void ioat_dma_remove(struct ioatdma_device *ioat_dma) { struct dma_device *dma = &ioat_dma->dma_dev; @@ -1181,9 +1169,9 @@ static int ioat3_dma_probe(struct ioatdma_device *ioat_dma, int dca) ioat_chan->reg_base + IOAT_DCACTRL_OFFSET); } - err = ioat_register(ioat_dma); + err = dma_async_device_register(&ioat_dma->dma_dev); if (err) - return err; + goto err_disable_interrupts; ioat_kobject_add(ioat_dma, &ioat_ktype); @@ -1192,20 +1180,29 @@ static int ioat3_dma_probe(struct ioatdma_device *ioat_dma, int dca) /* disable relaxed ordering */ err = pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &val16); - if (err) - return pcibios_err_to_errno(err); + if (err) { + err = pcibios_err_to_errno(err); + goto err_disable_interrupts; + } /* clear relaxed ordering enable */ val16 &= ~PCI_EXP_DEVCTL_RELAX_EN; err = pcie_capability_write_word(pdev, PCI_EXP_DEVCTL, val16); - if (err) - return pcibios_err_to_errno(err); + if (err) { + err = pcibios_err_to_errno(err); + goto err_disable_interrupts; + } if (ioat_dma->cap & IOAT_CAP_DPS) writeb(ioat_pending_level 
+ 1, ioat_dma->reg_base + IOAT_PREFETCH_LIMIT_OFFSET); return 0; + +err_disable_interrupts: + ioat_disable_interrupts(ioat_dma); + dma_pool_destroy(ioat_dma->completion_pool); + return err; } static void ioat_shutdown(struct pci_dev *pdev) From 29b7cd255f3628e0d65be33a939d8b5bba10aa62 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Tue, 28 May 2024 09:09:25 +0300 Subject: [PATCH 042/272] dmaengine: ioatdma: Fix kmemleak in ioat_pci_probe() If probing fails we end up with leaking ioatdma_device and each allocated channel. Following kmemleak easy to reproduce by injecting an error in ioat_alloc_chan_resources() when doing ioat_dma_self_test(). unreferenced object 0xffff888014ad5800 (size 1024): [..] [] kmemleak_alloc+0x4a/0x80 [] kmalloc_trace+0x270/0x2f0 [] ioat_pci_probe+0xc1/0x1c0 [ioatdma] [..] repeated for each ioatdma channel: unreferenced object 0xffff8880148e5c00 (size 512): [..] [] kmemleak_alloc+0x4a/0x80 [] kmalloc_trace+0x270/0x2f0 [] ioat_enumerate_channels+0x101/0x2d0 [ioatdma] [] ioat3_dma_probe+0x4d6/0x970 [ioatdma] [] ioat_pci_probe+0x181/0x1c0 [ioatdma] [..] 
Fixes: bf453a0a18b2 ("dmaengine: ioat: Support in-use unbind") Signed-off-by: Nikita Shubin Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20240528-ioatdma-fixes-v2-3-a9f2fbe26ab1@yadro.com Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index 26964b7c8cf14..cf688b0c8444c 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -1347,6 +1347,7 @@ static int ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) void __iomem * const *iomap; struct device *dev = &pdev->dev; struct ioatdma_device *device; + unsigned int i; u8 version; int err; @@ -1384,6 +1385,9 @@ static int ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) err = ioat3_dma_probe(device, ioat_dca_enabled); if (err) { + for (i = 0; i < IOAT_MAX_CHANS; i++) + kfree(device->idx[i]); + kfree(device); dev_err(dev, "Intel(R) I/OAT DMA Engine init failed\n"); return -ENODEV; } From fa555b5026d0bf1ba7c9e645ff75e2725a982631 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 13:54:22 +0200 Subject: [PATCH 043/272] dmaengine: fsl-edma: avoid linking both modules Kbuild does not support having a source file compiled multiple times and linked into distinct modules, or built-in and modular at the same time. For fsl-edma, there are two common components that are linked into the fsl-edma.ko for Arm and PowerPC, plus the mcf-edma.ko module on Coldfire. This violates the rule for compile-testing: scripts/Makefile.build:236: drivers/dma/Makefile: fsl-edma-common.o is added to multiple modules: fsl-edma mcf-edma scripts/Makefile.build:236: drivers/dma/Makefile: fsl-edma-trace.o is added to multiple modules: fsl-edma mcf-edma I tried splitting out the common parts into a separate module, but that adds back the complexity that a cleanup patch removed, and it gets harder with the addition of the tracepoints. 
As a minimal workaround, address it at the Kconfig level, by disallowing the broken configurations. Link: https://lore.kernel.org/lkml/20240110232255.1099757-1-arnd@kernel.org/ Fixes: 66aac8ea0a6c ("dmaengine: fsl-edma: clean up EXPORT_SYMBOL_GPL in fsl-edma-common.c") Signed-off-by: Arnd Bergmann Acked-by: Peng Fan Link: https://lore.kernel.org/r/20240528115440.2965975-1-arnd@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 002a5ec806207..9fc99cfbef08c 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -394,7 +394,7 @@ config LS2X_APB_DMA config MCF_EDMA tristate "Freescale eDMA engine support, ColdFire mcf5441x SoCs" - depends on M5441x || COMPILE_TEST + depends on M5441x || (COMPILE_TEST && FSL_EDMA=n) select DMA_ENGINE select DMA_VIRTUAL_CHANNELS help From 1345a13f18370ad9e5bc98995959a27f9bd71464 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 21 May 2024 10:30:02 +0200 Subject: [PATCH 044/272] dt-bindings: dma: fsl-edma: fix dma-channels constraints dma-channels is a number, not a list. Apply proper constraints on the actual number. 
Fixes: 6eb439dff645 ("dt-bindings: fsl-dma: fsl-edma: add edma3 compatible string") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Reviewed-by: Peng Fan Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20240521083002.23262-1-krzysztof.kozlowski@linaro.org Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/fsl,edma.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/dma/fsl,edma.yaml b/Documentation/devicetree/bindings/dma/fsl,edma.yaml index acfb4b2ee7a96..d54140f18d340 100644 --- a/Documentation/devicetree/bindings/dma/fsl,edma.yaml +++ b/Documentation/devicetree/bindings/dma/fsl,edma.yaml @@ -59,8 +59,8 @@ properties: - 3 dma-channels: - minItems: 1 - maxItems: 64 + minimum: 1 + maximum: 64 clocks: minItems: 1 From 5422145d0b749ad554ada772133b9b20f9fb0ec8 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Tue, 14 May 2024 13:52:31 +0300 Subject: [PATCH 045/272] dmaengine: ioatdma: Fix missing kmem_cache_destroy() Fix missing kmem_cache_destroy() for ioat_sed_cache in ioat_exit_module(). Noticed via: ``` modprobe ioatdma rmmod ioatdma modprobe ioatdma debugfs: Directory 'ioat_sed_ent' with parent 'slab' already present! 
``` Fixes: c0f28ce66ecf ("dmaengine: ioatdma: move all the init routines") Signed-off-by: Nikita Shubin Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20240514-ioatdma_fixes-v1-1-2776a0913254@yadro.com Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index cf688b0c8444c..e8f45a7fded43 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -1449,6 +1449,7 @@ module_init(ioat_init_module); static void __exit ioat_exit_module(void) { pci_unregister_driver(&ioat_pci_driver); + kmem_cache_destroy(ioat_sed_cache); kmem_cache_destroy(ioat_cache); } module_exit(ioat_exit_module); From d66e50beb91114f387bd798a371384b2a245e8cc Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 11 Jun 2024 18:53:17 +0100 Subject: [PATCH 046/272] KVM: arm64: FFA: Release hyp rx buffer According to the FF-A spec (Buffer states and ownership), after a producer has written into a buffer, it is "full" and now owned by the consumer. The producer won't be able to use that buffer, until the consumer hands it over with an invocation such as RX_RELEASE. It is clear in the following paragraph (Transfer of buffer ownership), that MEM_RETRIEVE_RESP is transferring the ownership from producer (in our case SPM) to consumer (hypervisor). RX_RELEASE is therefore mandatory here. It is less clear though what is happening with MEM_FRAG_TX. But this invocation, as a response to MEM_FRAG_RX writes into the same hypervisor RX buffer (see paragraph "Transmission of transaction descriptor in fragments"). Also this is matching the TF-A implementation where the RX buffer is marked "full" during a MEM_FRAG_RX. Release the RX hypervisor buffer in those two cases. This will unblock later invocations using this buffer which would otherwise fail. (RETRIEVE_REQ, MEM_FRAG_RX and PARTITION_INFO_GET). 
Signed-off-by: Vincent Donnefort Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20240611175317.1220842-1-vdonnefort@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/ffa.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 02746f9d0980f..efb053af331cc 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -177,6 +177,14 @@ static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len) res); } +static void ffa_rx_release(struct arm_smccc_res *res) +{ + arm_smccc_1_1_smc(FFA_RX_RELEASE, + 0, 0, + 0, 0, 0, 0, 0, + res); +} + static void do_ffa_rxtx_map(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { @@ -543,16 +551,19 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, if (WARN_ON(offset > len || fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { ret = FFA_RET_ABORTED; + ffa_rx_release(res); goto out_unlock; } if (len > ffa_desc_buf.len) { ret = FFA_RET_NO_MEMORY; + ffa_rx_release(res); goto out_unlock; } buf = ffa_desc_buf.buf; memcpy(buf, hyp_buffers.rx, fraglen); + ffa_rx_release(res); for (fragoff = fraglen; fragoff < len; fragoff += fraglen) { ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff); @@ -563,6 +574,7 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, fraglen = res->a3; memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen); + ffa_rx_release(res); } ffa_mem_reclaim(res, handle_lo, handle_hi, flags); From 442b15a2d7a3f01534cb80585b84d7b60e4e2219 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 31 May 2024 18:45:33 -0700 Subject: [PATCH 047/272] selftests/openat2: fix clang build failures: -static-libasan, LOCAL_HDRS When building with clang via: make LLVM=1 -C tools/testing/selftests two distinct failures occur: 1) gcc requires -static-libasan in order to ensure that Address Sanitizer's library is the first one loaded. 
However, this leads to build failures on clang, when building via: make LLVM=1 -C tools/testing/selftests However, clang already does the right thing by default: it statically links the Address Sanitizer if -fsanitize is specified. Therefore, fix this by simply omitting -static-libasan for clang builds. And leave behind a comment, because the whole reason for static linking might not be obvious. 2) clang won't accept invocations of this form, but gcc will: $(CC) file1.c header2.h Fix this by using selftests/lib.mk facilities for tracking local header file dependencies: add them to LOCAL_HDRS, leaving only the .c files to be passed to the compiler. Reviewed-by: Ryan Roberts Signed-off-by: John Hubbard Reviewed-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/openat2/Makefile | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/openat2/Makefile b/tools/testing/selftests/openat2/Makefile index 254d676a26898..185dc76ebb5fc 100644 --- a/tools/testing/selftests/openat2/Makefile +++ b/tools/testing/selftests/openat2/Makefile @@ -1,8 +1,18 @@ # SPDX-License-Identifier: GPL-2.0-or-later -CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined -static-libasan +CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined TEST_GEN_PROGS := openat2_test resolve_test rename_attack_test +# gcc requires -static-libasan in order to ensure that Address Sanitizer's +# library is the first one loaded. However, clang already statically links the +# Address Sanitizer if -fsanitize is specified. Therefore, simply omit +# -static-libasan for clang builds. 
+ifeq ($(LLVM),) + CFLAGS += -static-libasan +endif + +LOCAL_HDRS += helpers.h + include ../lib.mk -$(TEST_GEN_PROGS): helpers.c helpers.h +$(TEST_GEN_PROGS): helpers.c From ed3994ac847e0d6605f248e7f6776b1d4f445f4b Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 31 May 2024 18:45:34 -0700 Subject: [PATCH 048/272] selftests/fchmodat2: fix clang build failure due to -static-libasan gcc requires -static-libasan in order to ensure that Address Sanitizer's library is the first one loaded. However, this leads to build failures on clang, when building via: make LLVM=1 -C tools/testing/selftests However, clang already does the right thing by default: it statically links the Address Sanitizer if -fsanitize is specified. Therefore, simply omit -static-libasan for clang builds. And leave behind a comment, because the whole reason for static linking might not be obvious. Cc: Ryan Roberts Signed-off-by: John Hubbard Reviewed-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/fchmodat2/Makefile | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/fchmodat2/Makefile b/tools/testing/selftests/fchmodat2/Makefile index 71ec34bf1501e..4373cea79b794 100644 --- a/tools/testing/selftests/fchmodat2/Makefile +++ b/tools/testing/selftests/fchmodat2/Makefile @@ -1,6 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-or-later -CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined -static-libasan $(KHDR_INCLUDES) +CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined $(KHDR_INCLUDES) + +# gcc requires -static-libasan in order to ensure that Address Sanitizer's +# library is the first one loaded. However, clang already statically links the +# Address Sanitizer if -fsanitize is specified. Therefore, simply omit +# -static-libasan for clang builds. 
+ifeq ($(LLVM),) + CFLAGS += -static-libasan +endif + TEST_GEN_PROGS := fchmodat2_test include ../lib.mk From a126eca844353360ebafa9088d22865cb8e022e3 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 19 May 2024 23:07:35 +0200 Subject: [PATCH 049/272] rust: avoid unused import warning in `rusttest` When compiling for the `rusttest` target, the `core::ptr` import is unused since its only use happens in the `reserve()` method which is not compiled in that target: warning: unused import: `core::ptr` --> rust/kernel/alloc/vec_ext.rs:7:5 | 7 | use core::ptr; | ^^^^^^^^^ | = note: `#[warn(unused_imports)]` on by default Thus clean it. Fixes: 97ab3e8eec0c ("rust: alloc: fix dangling pointer in VecExt::reserve()") Reviewed-by: Alice Ryhl Reviewed-by: Danilo Krummrich Link: https://lore.kernel.org/r/20240519210735.587323-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/kernel/alloc/vec_ext.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rust/kernel/alloc/vec_ext.rs b/rust/kernel/alloc/vec_ext.rs index e9a81052728a4..1297a4be32e8c 100644 --- a/rust/kernel/alloc/vec_ext.rs +++ b/rust/kernel/alloc/vec_ext.rs @@ -4,7 +4,6 @@ use super::{AllocError, Flags}; use alloc::vec::Vec; -use core::ptr; /// Extensions to [`Vec`]. pub trait VecExt: Sized { @@ -141,7 +140,11 @@ impl VecExt for Vec { // `krealloc_aligned`. A `Vec`'s `ptr` value is not guaranteed to be NULL and might be // dangling after being created with `Vec::new`. Instead, we can rely on `Vec`'s capacity // to be zero if no memory has been allocated yet. - let ptr = if cap == 0 { ptr::null_mut() } else { old_ptr }; + let ptr = if cap == 0 { + core::ptr::null_mut() + } else { + old_ptr + }; // SAFETY: `ptr` is valid because it's either NULL or comes from a previous call to // `krealloc_aligned`. We also verified that the type is not a ZST. 
From 3572bd5689b0812b161b40279e39ca5b66d73e88 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jun 2024 22:30:37 +0900 Subject: [PATCH 050/272] tracing: Build event generation tests only as modules The kprobes and synth event generation test modules add events and lock (get a reference) those event file reference in module init function, and unlock and delete it in module exit function. This is because those are designed for playing as modules. If we make those modules as built-in, those events are left locked in the kernel, and never be removed. This causes kprobe event self-test failure as below. [ 97.349708] ------------[ cut here ]------------ [ 97.353453] WARNING: CPU: 3 PID: 1 at kernel/trace/trace_kprobe.c:2133 kprobe_trace_self_tests_init+0x3f1/0x480 [ 97.357106] Modules linked in: [ 97.358488] CPU: 3 PID: 1 Comm: swapper/0 Not tainted 6.9.0-g699646734ab5-dirty #14 [ 97.361556] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 97.363880] RIP: 0010:kprobe_trace_self_tests_init+0x3f1/0x480 [ 97.365538] Code: a8 24 08 82 e9 ae fd ff ff 90 0f 0b 90 48 c7 c7 e5 aa 0b 82 e9 ee fc ff ff 90 0f 0b 90 48 c7 c7 2d 61 06 82 e9 8e fd ff ff 90 <0f> 0b 90 48 c7 c7 33 0b 0c 82 89 c6 e8 6e 03 1f ff 41 ff c7 e9 90 [ 97.370429] RSP: 0000:ffffc90000013b50 EFLAGS: 00010286 [ 97.371852] RAX: 00000000fffffff0 RBX: ffff888005919c00 RCX: 0000000000000000 [ 97.373829] RDX: ffff888003f40000 RSI: ffffffff8236a598 RDI: ffff888003f40a68 [ 97.375715] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 [ 97.377675] R10: ffffffff811c9ae5 R11: ffffffff8120c4e0 R12: 0000000000000000 [ 97.379591] R13: 0000000000000001 R14: 0000000000000015 R15: 0000000000000000 [ 97.381536] FS: 0000000000000000(0000) GS:ffff88807dcc0000(0000) knlGS:0000000000000000 [ 97.383813] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 97.385449] CR2: 0000000000000000 CR3: 0000000002244000 CR4: 00000000000006b0 [ 97.387347] DR0: 0000000000000000 
DR1: 0000000000000000 DR2: 0000000000000000 [ 97.389277] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 97.391196] Call Trace: [ 97.391967] [ 97.392647] ? __warn+0xcc/0x180 [ 97.393640] ? kprobe_trace_self_tests_init+0x3f1/0x480 [ 97.395181] ? report_bug+0xbd/0x150 [ 97.396234] ? handle_bug+0x3e/0x60 [ 97.397311] ? exc_invalid_op+0x1a/0x50 [ 97.398434] ? asm_exc_invalid_op+0x1a/0x20 [ 97.399652] ? trace_kprobe_is_busy+0x20/0x20 [ 97.400904] ? tracing_reset_all_online_cpus+0x15/0x90 [ 97.402304] ? kprobe_trace_self_tests_init+0x3f1/0x480 [ 97.403773] ? init_kprobe_trace+0x50/0x50 [ 97.404972] do_one_initcall+0x112/0x240 [ 97.406113] do_initcall_level+0x95/0xb0 [ 97.407286] ? kernel_init+0x1a/0x1a0 [ 97.408401] do_initcalls+0x3f/0x70 [ 97.409452] kernel_init_freeable+0x16f/0x1e0 [ 97.410662] ? rest_init+0x1f0/0x1f0 [ 97.411738] kernel_init+0x1a/0x1a0 [ 97.412788] ret_from_fork+0x39/0x50 [ 97.413817] ? rest_init+0x1f0/0x1f0 [ 97.414844] ret_from_fork_asm+0x11/0x20 [ 97.416285] [ 97.417134] irq event stamp: 13437323 [ 97.418376] hardirqs last enabled at (13437337): [] console_unlock+0x11c/0x150 [ 97.421285] hardirqs last disabled at (13437370): [] console_unlock+0x101/0x150 [ 97.423838] softirqs last enabled at (13437366): [] handle_softirqs+0x23f/0x2a0 [ 97.426450] softirqs last disabled at (13437393): [] __irq_exit_rcu+0x66/0xd0 [ 97.428850] ---[ end trace 0000000000000000 ]--- And also, since we can not cleanup dynamic_event file, ftracetest are failed too. To avoid these issues, build these tests only as modules. 
Link: https://lore.kernel.org/all/171811263754.85078.5877446624311852525.stgit@devnote2/ Fixes: 9fe41efaca08 ("tracing: Add synth event generation test module") Fixes: 64836248dda2 ("tracing: Add kprobe event command generation test module") Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 166ad5444eeae..721c3b221048a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1136,7 +1136,7 @@ config PREEMPTIRQ_DELAY_TEST config SYNTH_EVENT_GEN_TEST tristate "Test module for in-kernel synthetic event generation" - depends on SYNTH_EVENTS + depends on SYNTH_EVENTS && m help This option creates a test module to check the base functionality of in-kernel synthetic event definition and @@ -1149,7 +1149,7 @@ config SYNTH_EVENT_GEN_TEST config KPROBE_EVENT_GEN_TEST tristate "Test module for in-kernel kprobe event generation" - depends on KPROBE_EVENTS + depends on KPROBE_EVENTS && m help This option creates a test module to check the base functionality of in-kernel kprobe event definition. From 0941772342d59e48733131ac3a202fa1a4d832e9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 11 Jun 2024 18:58:16 +0200 Subject: [PATCH 051/272] wifi: cfg80211: wext: set ssids=NULL for passive scans In nl80211, we always set the ssids of a scan request to NULL when n_ssids==0 (passive scan). Drivers have relied on this behaviour in the past, so we fixed it in 6 GHz scan requests as well, and added a warning so we'd have assurance the API would always be called that way. syzbot found that wext doesn't ensure that, so we reach the check and trigger the warning. Fix the wext code to set the ssids pointer to NULL when there are none. 
Reported-by: syzbot+cd6135193ba6bb9ad158@syzkaller.appspotmail.com Fixes: f7a8b10bfd61 ("wifi: cfg80211: fix 6 GHz scan request building") Signed-off-by: Johannes Berg --- net/wireless/scan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 2f2a3163968a7..d7485e26f4fc2 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3493,8 +3493,10 @@ int cfg80211_wext_siwscan(struct net_device *dev, memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); creq->ssids[0].ssid_len = wreq->essid_len; } - if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) + if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) { + creq->ssids = NULL; creq->n_ssids = 0; + } } for (i = 0; i < NUM_NL80211_BANDS; i++) From 6ef09cdc5ba0f93826c09d810c141a8d103a80fc Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 31 May 2024 06:20:10 +0300 Subject: [PATCH 052/272] wifi: cfg80211: wext: add extra SIOCSIWSCAN data check In 'cfg80211_wext_siwscan()', add extra check whether number of channels passed via 'ioctl(sock, SIOCSIWSCAN, ...)' doesn't exceed IW_MAX_FREQUENCIES and reject invalid request with -EINVAL otherwise. 
Reported-by: syzbot+253cd2d2491df77c93ac@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=253cd2d2491df77c93ac Signed-off-by: Dmitry Antipov Link: https://msgid.link/20240531032010.451295-1-dmantipov@yandex.ru Signed-off-by: Johannes Berg --- net/wireless/scan.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d7485e26f4fc2..0222ede0feb60 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3416,10 +3416,14 @@ int cfg80211_wext_siwscan(struct net_device *dev, wiphy = &rdev->wiphy; /* Determine number of channels, needed to allocate creq */ - if (wreq && wreq->num_channels) + if (wreq && wreq->num_channels) { + /* Passed from userspace so should be checked */ + if (unlikely(wreq->num_channels > IW_MAX_FREQUENCIES)) + return -EINVAL; n_channels = wreq->num_channels; - else + } else { n_channels = ieee80211_get_num_supported_channels(wiphy); + } creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) + n_channels * sizeof(void *), From d792011b6c282bfb787eb2893538e5e336d5e982 Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Wed, 5 Jun 2024 14:05:04 +0300 Subject: [PATCH 053/272] wifi: iwlwifi: mvm: unlock mvm mutex Unlock the mvm mutex before returning from a function with the mutex locked. 
Fixes: a1efeb823084 ("wifi: iwlwifi: mvm: Block EMLSR when a p2p/softAP vif is active") Signed-off-by: Shaul Triebitz Signed-off-by: Miri Korenblit Link: https://msgid.link/20240605140327.96cb956db4af.Ib468cbad38959910977b5581f6111ab0afae9880@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/time-event.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c index 8ee4498f42455..31bc80cdcb7d5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c @@ -1238,6 +1238,7 @@ void iwl_mvm_stop_roc(struct iwl_mvm *mvm, struct ieee80211_vif *vif) if (te_data->id >= SESSION_PROTECT_CONF_MAX_ID) { IWL_DEBUG_TE(mvm, "No remain on channel event\n"); + mutex_unlock(&mvm->mutex); return; } @@ -1253,6 +1254,7 @@ void iwl_mvm_stop_roc(struct iwl_mvm *mvm, struct ieee80211_vif *vif) te_data = iwl_mvm_get_roc_te(mvm); if (!te_data) { IWL_WARN(mvm, "No remain on channel event\n"); + mutex_unlock(&mvm->mutex); return; } From 4c2bed6042fb6aca1d1d4f291f85461b1d5ac08c Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Wed, 5 Jun 2024 14:05:05 +0300 Subject: [PATCH 054/272] wifi: iwlwifi: mvm: fix ROC version check For using the ROC command, check that the ROC version is *greater or equal* to 3, rather than *equal* to 3. The ROC version was added to the TLV starting from version 3. 
Fixes: 67ac248e4db0 ("wifi: iwlwifi: mvm: implement ROC version 3") Signed-off-by: Shaul Triebitz Signed-off-by: Miri Korenblit Link: https://msgid.link/20240605140327.93d86cd188ad.Iceadef5a2f3cfa4a127e94a0405eba8342ec89c1@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index de9f0b4465456..18ce060df9b5b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -4795,7 +4795,7 @@ static int iwl_mvm_roc_station(struct iwl_mvm *mvm, if (fw_ver == IWL_FW_CMD_VER_UNKNOWN) { ret = iwl_mvm_send_aux_roc_cmd(mvm, channel, vif, duration); - } else if (fw_ver == 3) { + } else if (fw_ver >= 3) { ret = iwl_mvm_roc_add_cmd(mvm, channel, vif, duration, ROC_ACTIVITY_HOTSPOT); } else { From fcc356020a0171106c9ba524ba05a6792668451e Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Wed, 5 Jun 2024 14:07:38 +0300 Subject: [PATCH 055/272] wifi: iwlwifi: scan: correctly check if PSC listen period is needed The flags variable is incorrectly checked while it is still cleared and has not been assigned any value yet. Fix it. 
Fixes: a615323f7f90 ("wifi: iwlwifi: mvm: always apply 6 GHz probe limitations") Signed-off-by: Ayala Beker Reviewed-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240605140556.291c33f9a283.Id651fe69828aebce177b49b2316c5780906f1b37@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index b5f664ae5a17d..e975f5ff17b5d 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -1830,7 +1830,7 @@ iwl_mvm_umac_scan_cfg_channels_v7_6g(struct iwl_mvm *mvm, */ if (!iwl_mvm_is_scan_fragmented(params->type)) { if (!cfg80211_channel_is_psc(params->channels[i]) || - flags & IWL_UHB_CHAN_CFG_FLAG_PSC_CHAN_NO_LISTEN) { + psc_no_listen) { if (unsolicited_probe_on_chan) { max_s_ssids = 2; max_bssids = 6; From 7d09e17c0415fe6d946044c7e70bce31cda952ec Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Sat, 18 May 2024 18:07:33 +0200 Subject: [PATCH 056/272] wifi: mac80211: Recalc offload when monitor stop When a monitor interface is started, ieee80211_recalc_offload() is called and 802.11 encapsulation offloading support gets disabled so monitor interface could get native wifi frames directly. But when this interface is stopped there is no need to keep the 802.11 encapsulation offloading off. Call ieee80211_recalc_offload() when monitor interface is stopped so 802.11 encapsulation offloading gets re-activated if possible. 
Fixes: 6aea26ce5a4c ("mac80211: rework tx encapsulation offload API") Signed-off-by: Remi Pommarel Link: https://msgid.link/840baab454f83718e6e16fd836ac597d924e85b9.1716048326.git.repk@triplefau.lt Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index dc42902e26935..0c54554bf761b 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -686,6 +686,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_del_virtual_monitor(local); ieee80211_recalc_idle(local); + ieee80211_recalc_offload(local); if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) break; From 4cac29b846f38d5f0654cdfff5c5bfc37305081c Mon Sep 17 00:00:00 2001 From: Kalle Niemi Date: Wed, 12 Jun 2024 14:42:34 +0300 Subject: [PATCH 057/272] regulator: bd71815: fix ramp values Ramp values are inverted. This caused wrong values written to register when ramp values were defined in device tree. Invert values in table to fix this. 
Signed-off-by: Kalle Niemi Fixes: 1aad39001e85 ("regulator: Support ROHM BD71815 regulators") Reviewed-by: Matti Vaittinen Link: https://lore.kernel.org/r/ZmmJXtuVJU6RgQAH@latitude5580 Signed-off-by: Mark Brown --- drivers/regulator/bd71815-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/bd71815-regulator.c b/drivers/regulator/bd71815-regulator.c index 26192d55a6858..79fbb45297f6b 100644 --- a/drivers/regulator/bd71815-regulator.c +++ b/drivers/regulator/bd71815-regulator.c @@ -256,7 +256,7 @@ static int buck12_set_hw_dvs_levels(struct device_node *np, * 10: 2.50mV/usec 10mV 4uS * 11: 1.25mV/usec 10mV 8uS */ -static const unsigned int bd7181x_ramp_table[] = { 1250, 2500, 5000, 10000 }; +static const unsigned int bd7181x_ramp_table[] = { 10000, 5000, 2500, 1250 }; static int bd7181x_led_set_current_limit(struct regulator_dev *rdev, int min_uA, int max_uA) From 72cacd06e47d86d89b0e7179fbc9eb3a0f39cd93 Mon Sep 17 00:00:00 2001 From: Julien Panis Date: Tue, 4 Jun 2024 18:46:58 +0200 Subject: [PATCH 058/272] thermal/drivers/mediatek/lvts_thermal: Return error in case of invalid efuse data This patch prevents from registering thermal entries and letting the driver misbehave if efuse data is invalid. A device is not properly calibrated if the golden temperature is zero. 
Fixes: f5f633b18234 ("thermal/drivers/mediatek: Add the Low Voltage Thermal Sensor driver") Signed-off-by: Julien Panis Reviewed-by: Nicolas Pitre Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20240604-mtk-thermal-calib-check-v2-1-8f258254051d@baylibre.com Signed-off-by: Daniel Lezcano --- drivers/thermal/mediatek/lvts_thermal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c index 82c355c466cfe..819ed0110f3e7 100644 --- a/drivers/thermal/mediatek/lvts_thermal.c +++ b/drivers/thermal/mediatek/lvts_thermal.c @@ -769,7 +769,11 @@ static int lvts_golden_temp_init(struct device *dev, u8 *calib, */ gt = (((u32 *)calib)[0] >> lvts_data->gt_calib_bit_offset) & 0xff; - if (gt && gt < LVTS_GOLDEN_TEMP_MAX) + /* A zero value for gt means that device has invalid efuse data */ + if (!gt) + return -ENODATA; + + if (gt < LVTS_GOLDEN_TEMP_MAX) golden_temp = gt; golden_temp_offset = golden_temp * 500 + lvts_data->temp_offset; From 0057222c45140830a7bf55e92fb67f84a2814f67 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 18 Apr 2024 01:07:33 +0100 Subject: [PATCH 059/272] regulator: axp20x: AXP717: fix LDO supply rails and off-by-ones The X-Powers AXP717 PMIC has separate input supply pins for each group of LDOs, so they are not all using the same DCDC1 input, as described currently. Replace the "supply" member of each LDO description with the respective group supply name, so that the supply dependencies can be correctly described in the devicetree. Also fix two off-by-ones in the regulator macros, after some double checking the numbers against the datasheet. This uncovered a bug in the datasheet: add a comment to document this. 
Fixes: d2ac3df75c3a ("regulator: axp20x: add support for the AXP717") Signed-off-by: Andre Przywara Reviewed-by: John Watts Link: https://lore.kernel.org/r/20240418000736.24338-3-andre.przywara@arm.com Signed-off-by: Mark Brown --- drivers/regulator/axp20x-regulator.c | 33 ++++++++++++++++------------ 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 34fcdd82b2eaa..f3c447ecdc3bf 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -140,7 +140,7 @@ #define AXP717_DCDC1_NUM_VOLTAGES 88 #define AXP717_DCDC2_NUM_VOLTAGES 107 -#define AXP717_DCDC3_NUM_VOLTAGES 104 +#define AXP717_DCDC3_NUM_VOLTAGES 103 #define AXP717_DCDC_V_OUT_MASK GENMASK(6, 0) #define AXP717_LDO_V_OUT_MASK GENMASK(4, 0) @@ -763,10 +763,15 @@ static const struct linear_range axp717_dcdc1_ranges[] = { REGULATOR_LINEAR_RANGE(1220000, 71, 87, 20000), }; +/* + * The manual says that the last voltage is 3.4V, encoded as 0b1101011 (107), + * but every other method proves that this is wrong, so it's really 106 that + * programs the final 3.4V. 
+ */ static const struct linear_range axp717_dcdc2_ranges[] = { REGULATOR_LINEAR_RANGE(500000, 0, 70, 10000), REGULATOR_LINEAR_RANGE(1220000, 71, 87, 20000), - REGULATOR_LINEAR_RANGE(1600000, 88, 107, 100000), + REGULATOR_LINEAR_RANGE(1600000, 88, 106, 100000), }; static const struct linear_range axp717_dcdc3_ranges[] = { @@ -790,40 +795,40 @@ static const struct regulator_desc axp717_regulators[] = { AXP_DESC(AXP717, DCDC4, "dcdc4", "vin4", 1000, 3700, 100, AXP717_DCDC4_CONTROL, AXP717_DCDC_V_OUT_MASK, AXP717_DCDC_OUTPUT_CONTROL, BIT(3)), - AXP_DESC(AXP717, ALDO1, "aldo1", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, ALDO1, "aldo1", "aldoin", 500, 3500, 100, AXP717_ALDO1_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(0)), - AXP_DESC(AXP717, ALDO2, "aldo2", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, ALDO2, "aldo2", "aldoin", 500, 3500, 100, AXP717_ALDO2_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(1)), - AXP_DESC(AXP717, ALDO3, "aldo3", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, ALDO3, "aldo3", "aldoin", 500, 3500, 100, AXP717_ALDO3_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(2)), - AXP_DESC(AXP717, ALDO4, "aldo4", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, ALDO4, "aldo4", "aldoin", 500, 3500, 100, AXP717_ALDO4_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(3)), - AXP_DESC(AXP717, BLDO1, "bldo1", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, BLDO1, "bldo1", "bldoin", 500, 3500, 100, AXP717_BLDO1_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(4)), - AXP_DESC(AXP717, BLDO2, "bldo2", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, BLDO2, "bldo2", "bldoin", 500, 3500, 100, AXP717_BLDO2_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(5)), - AXP_DESC(AXP717, BLDO3, "bldo3", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, BLDO3, "bldo3", "bldoin", 500, 3500, 100, AXP717_BLDO3_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(6)), - AXP_DESC(AXP717, BLDO4, "bldo4", 
"vin1", 500, 3500, 100, + AXP_DESC(AXP717, BLDO4, "bldo4", "bldoin", 500, 3500, 100, AXP717_BLDO4_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO0_OUTPUT_CONTROL, BIT(7)), - AXP_DESC(AXP717, CLDO1, "cldo1", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, CLDO1, "cldo1", "cldoin", 500, 3500, 100, AXP717_CLDO1_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO1_OUTPUT_CONTROL, BIT(0)), - AXP_DESC(AXP717, CLDO2, "cldo2", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, CLDO2, "cldo2", "cldoin", 500, 3500, 100, AXP717_CLDO2_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO1_OUTPUT_CONTROL, BIT(1)), - AXP_DESC(AXP717, CLDO3, "cldo3", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, CLDO3, "cldo3", "cldoin", 500, 3500, 100, AXP717_CLDO3_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO1_OUTPUT_CONTROL, BIT(2)), - AXP_DESC(AXP717, CLDO4, "cldo4", "vin1", 500, 3500, 100, + AXP_DESC(AXP717, CLDO4, "cldo4", "cldoin", 500, 3500, 100, AXP717_CLDO4_CONTROL, AXP717_LDO_V_OUT_MASK, AXP717_LDO1_OUTPUT_CONTROL, BIT(3)), AXP_DESC(AXP717, CPUSLDO, "cpusldo", "vin1", 500, 1400, 50, From 26ba7c3f139f843bf46ed0779e30d84641767959 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 12 Jun 2024 15:53:29 -0700 Subject: [PATCH 060/272] MAINTAINERS: mailmap: Update Stanislav's email address Moving to personal address for upstream work. 
Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20240612225334.41869-1-sdf@google.com Signed-off-by: Alexei Starovoitov --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 9af91bd3584bd..6898017339756 100644 --- a/.mailmap +++ b/.mailmap @@ -606,6 +606,7 @@ Simon Kelley Sricharan Ramabadhran Srinivas Ramana Sriram R +Stanislav Fomichev Stefan Wahren Stéphane Witzmann Stephen Hemminger diff --git a/MAINTAINERS b/MAINTAINERS index aacccb376c28a..4582af09f2da0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3980,7 +3980,7 @@ R: Song Liu R: Yonghong Song R: John Fastabend R: KP Singh -R: Stanislav Fomichev +R: Stanislav Fomichev R: Hao Luo R: Jiri Olsa L: bpf@vger.kernel.org From 6e5aee08bd2517397c9572243a816664f2ead547 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Thu, 13 Jun 2024 10:17:09 +0200 Subject: [PATCH 061/272] Revert "MIPS: pci: lantiq: restore reset gpio polarity" This reverts commit 277a0363120276645ae598d8d5fea7265e076ae9. While fixing old boards with broken DTs, this change will break newer ones with correct gpio polarity annotation. 
Signed-off-by: Thomas Bogendoerfer --- arch/mips/pci/pci-lantiq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c index 0844db34022e4..68a8cefed420b 100644 --- a/arch/mips/pci/pci-lantiq.c +++ b/arch/mips/pci/pci-lantiq.c @@ -124,14 +124,14 @@ static int ltq_pci_startup(struct platform_device *pdev) clk_disable(clk_external); /* setup reset gpio used by pci */ - reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_ASIS); + reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset", + GPIOD_OUT_LOW); error = PTR_ERR_OR_ZERO(reset_gpio); if (error) { dev_err(&pdev->dev, "failed to request gpio: %d\n", error); return error; } gpiod_set_consumer_name(reset_gpio, "pci_reset"); - gpiod_direction_output(reset_gpio, 1); /* enable auto-switching between PCI and EBU */ ltq_pci_w32(0xa, PCI_CR_CLK_CTRL); @@ -194,10 +194,10 @@ static int ltq_pci_startup(struct platform_device *pdev) /* toggle reset pin */ if (reset_gpio) { - gpiod_set_value_cansleep(reset_gpio, 0); + gpiod_set_value_cansleep(reset_gpio, 1); wmb(); mdelay(1); - gpiod_set_value_cansleep(reset_gpio, 1); + gpiod_set_value_cansleep(reset_gpio, 0); } return 0; } From ea5f8c4cffcd8a6b62b3a3bd5008275218c9d02a Mon Sep 17 00:00:00 2001 From: Andy Chi Date: Wed, 5 Jun 2024 17:22:41 +0800 Subject: [PATCH 062/272] ALSA: hda/realtek: fix mute/micmute LEDs don't work for ProBook 445/465 G11. HP ProBook 445/465 G11 needs ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF quirk to make mic-mute/audio-mute working. 
Signed-off-by: Andy Chi Cc: Link: https://lore.kernel.org/r/20240605092243.41963-1-andy.chi@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index aa76d1c885895..54a52c1480707 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10194,6 +10194,10 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8c70, "HP EliteBook 835 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c71, "HP EliteBook 845 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c72, "HP EliteBook 865 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8c7b, "HP ProBook 445 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8c7c, "HP ProBook 445 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8c7d, "HP ProBook 465 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8c7e, "HP ProBook 465 G11", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8c89, "HP ProBook 460 G11", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c8a, "HP EliteBook 630", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c8c, "HP EliteBook 660", ALC236_FIXUP_HP_GPIO_LED), From 86a433862912f52597263aa224a9ed82bcd533bf Mon Sep 17 00:00:00 2001 From: Edson Juliano Drosdeck Date: Wed, 5 Jun 2024 12:39:23 -0300 Subject: [PATCH 063/272] ALSA: hda/realtek: Limit mic boost on N14AP7 The internal mic boost on the N14AP7 is too high. Fix this by applying the ALC269_FIXUP_LIMIT_INT_MIC_BOOST fixup to the machine to limit the gain. 
Signed-off-by: Edson Juliano Drosdeck Cc: Link: https://lore.kernel.org/r/20240605153923.2837-1-edson.drosdeck@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 54a52c1480707..da54300279af8 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10585,6 +10585,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */ SND_PCI_QUIRK(0x1c06, 0x2013, "Lemote A1802", ALC269_FIXUP_LEMOTE_A1802), SND_PCI_QUIRK(0x1c06, 0x2015, "Lemote A190X", ALC269_FIXUP_LEMOTE_A190X), + SND_PCI_QUIRK(0x1c6c, 0x122a, "Positivo N14AP7", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x1c6c, 0x1251, "Positivo N14KP6-TG", ALC288_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d05, 0x1132, "TongFang PHxTxX1", ALC256_FIXUP_SET_COEF_DEFAULTS), SND_PCI_QUIRK(0x1d05, 0x1096, "TongFang GMxMRxx", ALC269_FIXUP_NO_SHUTUP), From e799bdf51d54bebaf939fdb655aad424e624c1b1 Mon Sep 17 00:00:00 2001 From: "Dustin L. Howett" Date: Wed, 5 Jun 2024 12:01:32 -0500 Subject: [PATCH 064/272] ALSA: hda/realtek: Remove Framework Laptop 16 from quirks The Framework Laptop 16 does not have a combination headphone/headset 3.5mm jack; however, applying the pincfg from the Laptop 13 (nid=0x19) erroneously informs hda that the node is present. Fixes: 8804fa04a492 ("ALSA: hda/realtek: Add Framework laptop 16 to quirks") Signed-off-by: Dustin L. 
Howett Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20240605-alsa-hda-realtek-remove-framework-laptop-16-from-quirks-v1-1-11d47fe8ec4d@howett.net Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index da54300279af8..8408e0b0730a6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10610,7 +10610,6 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x8086, 0x2081, "Intel NUC 10", ALC256_FIXUP_INTEL_NUC10), SND_PCI_QUIRK(0x8086, 0x3038, "Intel NUC 13", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0xf111, 0x0001, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), - SND_PCI_QUIRK(0xf111, 0x0005, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x0006, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), #if 0 From 82f3daed2d3590fa286a02301573a183dd902a0f Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 6 Jun 2024 14:03:48 +0100 Subject: [PATCH 065/272] ALSA: hda: cs35l41: Support Lenovo Thinkbook 16P Gen 5 This laptop does not contain _DSD so needs to be supported using the configuration table. 
Signed-off-by: Stefan Binding Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240606130351.333495-2-sbinding@opensource.cirrus.com --- sound/pci/hda/cs35l41_hda_property.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index 6a7a6d486916a..046a94250683d 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -128,6 +128,8 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "17AA38B5", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38B6", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38B7", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, + { "17AA38F9", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, + { "17AA38FA", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, {} }; @@ -529,6 +531,8 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "17AA38B5", generic_dsd_config }, { "CSC3551", "17AA38B6", generic_dsd_config }, { "CSC3551", "17AA38B7", generic_dsd_config }, + { "CSC3551", "17AA38F9", generic_dsd_config }, + { "CSC3551", "17AA38FA", generic_dsd_config }, {} }; From b32f92d1af3789038f03c2899e3be0d00b43faf2 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 6 Jun 2024 14:03:49 +0100 Subject: [PATCH 066/272] ALSA: hda: cs35l41: Support Lenovo Thinkbook 13x Gen 4 This laptop does not contain _DSD so needs to be supported using the configuration table. 
Signed-off-by: Stefan Binding Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240606130351.333495-3-sbinding@opensource.cirrus.com --- sound/pci/hda/cs35l41_hda_property.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index 046a94250683d..51998d1c72ff1 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -128,6 +128,8 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "17AA38B5", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38B6", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38B7", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, + { "17AA38C7", 4, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT }, 0, 2, -1, 1000, 4500, 24 }, + { "17AA38C8", 4, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT }, 0, 2, -1, 1000, 4500, 24 }, { "17AA38F9", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, { "17AA38FA", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, {} @@ -531,6 +533,8 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "17AA38B5", generic_dsd_config }, { "CSC3551", "17AA38B6", generic_dsd_config }, { "CSC3551", "17AA38B7", generic_dsd_config }, + { "CSC3551", "17AA38C7", generic_dsd_config }, + { "CSC3551", "17AA38C8", generic_dsd_config }, { "CSC3551", "17AA38F9", generic_dsd_config }, { "CSC3551", "17AA38FA", generic_dsd_config }, {} From 75f2ea939b5c694b36aad8ef823a2f9bcf7b3d7d Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 6 Jun 2024 14:03:50 +0100 Subject: [PATCH 067/272] ALSA: hda/realtek: Support Lenovo Thinkbook 16P Gen 5 Add support for this laptop, which uses CS35L41 HDA amps. 
The laptop does not contain valid _DSD for these amps, so requires entries into the CS35L41 configuration table to function correctly. [ fixed to lower hex numbers in quirk entries -- tiwai ] Signed-off-by: Stefan Binding Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240606130351.333495-4-sbinding@opensource.cirrus.com --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8408e0b0730a6..2320f04eca5aa 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10548,6 +10548,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x38cd, "Y790 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38d2, "Lenovo Yoga 9 14IMH9", ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x38d7, "Lenovo Yoga 9 14IMH9", ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN), + SND_PCI_QUIRK(0x17aa, 0x38f9, "Thinkbook 16P Gen5", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x38fa, "Thinkbook 16P Gen5", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI), SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC), SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo B50-70", ALC269_FIXUP_DMIC_THINKPAD_ACPI), From 4ecb16d9250e6fcf8818572bf317b6adae16515b Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 6 Jun 2024 14:03:51 +0100 Subject: [PATCH 068/272] ALSA: hda/realtek: Support Lenovo Thinkbook 13x Gen 4 Add support for this laptop, which uses CS35L41 HDA amps. The laptop does not contain valid _DSD for these amps, so requires entries into the CS35L41 configuration table to function correctly. 
Signed-off-by: Stefan Binding Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240606130351.333495-5-sbinding@opensource.cirrus.com --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 2320f04eca5aa..79736c8782a31 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10544,6 +10544,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x38be, "Yoga S980-14.5 proX YC Dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38bf, "Yoga S980-14.5 proX LX Dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38c3, "Y980 DUAL", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x17aa, 0x38c7, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4), + SND_PCI_QUIRK(0x17aa, 0x38c8, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4), SND_PCI_QUIRK(0x17aa, 0x38cb, "Y790 YG DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38cd, "Y790 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38d2, "Lenovo Yoga 9 14IMH9", ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN), From 2646b43910c0e6d7f4ad535919b44b88f98c688d Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 7 Jun 2024 09:00:21 +0300 Subject: [PATCH 069/272] ALSA/hda: intel-dsp-config: Document AVS as dsp_driver option dsp_driver=4 will force the AVS driver stack to be used; it is better to document this. 
Fixes: 1affc44ea5dd ("ASoC: Intel: avs: PCI driver implementation") Signed-off-by: Peter Ujfalusi Reviewed-by: Cezary Rojewski Link: https://lore.kernel.org/r/20240607060021.11503-1-peter.ujfalusi@linux.intel.com Signed-off-by: Takashi Iwai --- sound/hda/intel-dsp-config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/hda/intel-dsp-config.c b/sound/hda/intel-dsp-config.c index 537863447358e..478d2b50c571d 100644 --- a/sound/hda/intel-dsp-config.c +++ b/sound/hda/intel-dsp-config.c @@ -18,7 +18,7 @@ static int dsp_driver; module_param(dsp_driver, int, 0444); -MODULE_PARM_DESC(dsp_driver, "Force the DSP driver for Intel DSP (0=auto, 1=legacy, 2=SST, 3=SOF)"); +MODULE_PARM_DESC(dsp_driver, "Force the DSP driver for Intel DSP (0=auto, 1=legacy, 2=SST, 3=SOF, 4=AVS)"); #define FLAG_SST BIT(0) #define FLAG_SOF BIT(1) From bc69ad74867dba1377abe14356c94a946d9837a3 Mon Sep 17 00:00:00 2001 From: En-Wei Wu Date: Thu, 30 May 2024 22:21:31 +0800 Subject: [PATCH 070/272] ice: avoid IRQ collision to fix init failure on ACPI S3 resume A bug in https://bugzilla.kernel.org/show_bug.cgi?id=218906 describes that irdma would break and report hardware initialization failed after suspend/resume with Intel E810 NIC (tested on 6.9.0-rc5). The problem is caused due to the collision between the irq numbers requested in irdma and the irq numbers requested in other drivers after suspend/resume. The irq numbers used by irdma are derived from ice's ice_pf->msix_entries which stores mappings between MSI-X index and Linux interrupt number. It's supposed to be cleaned up when suspend and rebuilt in resume but it's not, causing irdma using the old irq numbers stored in the old ice_pf->msix_entries to request_irq() when resume. And eventually collide with other drivers. This patch fixes this problem. On suspend, we call ice_deinit_rdma() to clean up the ice_pf->msix_entries (and free the MSI-X vectors used by irdma if we've dynamically allocated them). 
On resume, we call ice_init_rdma() to rebuild the ice_pf->msix_entries (and allocate the MSI-X vectors if we would like to dynamically allocate them). Fixes: f9f5301e7e2d ("ice: Register auxiliary device to provide RDMA") Tested-by: Cyrus Lien Signed-off-by: En-Wei Wu Reviewed-by: Wojciech Drewek Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 1b61ca3a6eb6e..45d850514f4c3 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5564,7 +5564,7 @@ static int ice_suspend(struct device *dev) */ disabled = ice_service_task_stop(pf); - ice_unplug_aux_dev(pf); + ice_deinit_rdma(pf); /* Already suspended?, then there is nothing to do */ if (test_and_set_bit(ICE_SUSPENDED, pf->state)) { @@ -5644,6 +5644,11 @@ static int ice_resume(struct device *dev) if (ret) dev_err(dev, "Cannot restore interrupt scheme: %d\n", ret); + ret = ice_init_rdma(pf); + if (ret) + dev_err(dev, "Reinitialize RDMA during resume failed: %d\n", + ret); + clear_bit(ICE_DOWN, pf->state); /* Now perform PF reset and rebuild */ reset_type = ICE_RESET_PFR; From aeccadb24d9dacdde673a0f68f0a9135c6be4993 Mon Sep 17 00:00:00 2001 From: Paul Greenwalt Date: Thu, 30 May 2024 13:06:17 -0400 Subject: [PATCH 071/272] ice: fix 200G link speed message log Commit 24407a01e57c ("ice: Add 200G speed/phy type use") added support for 200G PHY speeds, but did not include 200G link speed message support. As a result the driver incorrectly reports Unknown for 200G link speed. Fix this by adding 200G support to ice_print_link_msg(). 
Fixes: 24407a01e57c ("ice: Add 200G speed/phy type use") Reviewed-by: Michal Swiatkowski Signed-off-by: Paul Greenwalt Reviewed-by: Jesse Brandeburg Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 45d850514f4c3..1766230abfff6 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -805,6 +805,9 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup) } switch (vsi->port_info->phy.link_info.link_speed) { + case ICE_AQ_LINK_SPEED_200GB: + speed = "200 G"; + break; case ICE_AQ_LINK_SPEED_100GB: speed = "100 G"; break; From a27f6ac9d404ea84196639dcc456f969ef813c0f Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Tue, 4 Jun 2024 14:55:14 +0200 Subject: [PATCH 072/272] ice: implement AQ download pkg retry ice_aqc_opc_download_pkg (0x0C40) AQ sporadically returns error due to FW issue. Fix this by retrying five times before moving to Safe Mode. Sleep for 20 ms before retrying. This was tested with the 4.40 firmware. 
Fixes: c76488109616 ("ice: Implement Dynamic Device Personalization (DDP) download") Reviewed-by: Michal Swiatkowski Signed-off-by: Wojciech Drewek Reviewed-by: Brett Creeley Reviewed-by: Przemek Kitszel Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ddp.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c index ce5034ed2b240..f182179529b7d 100644 --- a/drivers/net/ethernet/intel/ice/ice_ddp.c +++ b/drivers/net/ethernet/intel/ice/ice_ddp.c @@ -1339,6 +1339,7 @@ ice_dwnld_cfg_bufs_no_lock(struct ice_hw *hw, struct ice_buf *bufs, u32 start, for (i = 0; i < count; i++) { bool last = false; + int try_cnt = 0; int status; bh = (struct ice_buf_hdr *)(bufs + start + i); @@ -1346,8 +1347,26 @@ ice_dwnld_cfg_bufs_no_lock(struct ice_hw *hw, struct ice_buf *bufs, u32 start, if (indicate_last) last = ice_is_last_download_buffer(bh, i, count); - status = ice_aq_download_pkg(hw, bh, ICE_PKG_BUF_SIZE, last, - &offset, &info, NULL); + while (1) { + status = ice_aq_download_pkg(hw, bh, ICE_PKG_BUF_SIZE, + last, &offset, &info, + NULL); + if (hw->adminq.sq_last_status != ICE_AQ_RC_ENOSEC && + hw->adminq.sq_last_status != ICE_AQ_RC_EBADSIG) + break; + + try_cnt++; + + if (try_cnt == 5) + break; + + msleep(20); + } + + if (try_cnt) + dev_dbg(ice_hw_to_dev(hw), + "ice_aq_download_pkg number of retries: %d\n", + try_cnt); /* Save AQ status from download package */ if (status) { From 92424801261d1564a0bb759da3cf3ccd69fdf5a2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jun 2024 13:53:08 +0200 Subject: [PATCH 073/272] bpf: Fix reg_set_min_max corruption of fake_reg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Juan reported that after doing some changes to buzzer [0] and implementing a new fuzzing strategy guided by coverage, 
they noticed the following in one of the probes: [...] 13: (79) r6 = *(u64 *)(r0 +0) ; R0=map_value(ks=4,vs=8) R6_w=scalar() 14: (b7) r0 = 0 ; R0_w=0 15: (b4) w0 = -1 ; R0_w=0xffffffff 16: (74) w0 >>= 1 ; R0_w=0x7fffffff 17: (5c) w6 &= w0 ; R0_w=0x7fffffff R6_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) 18: (44) w6 |= 2 ; R6_w=scalar(smin=umin=smin32=umin32=2,smax=umax=umax32=0x7fffffff,var_off=(0x2; 0x7ffffffd)) 19: (56) if w6 != 0x7ffffffd goto pc+1 REG INVARIANTS VIOLATION (true_reg2): range bounds violation u64=[0x7fffffff, 0x7ffffffd] s64=[0x7fffffff, 0x7ffffffd] u32=[0x7fffffff, 0x7ffffffd] s32=[0x7fffffff, 0x7ffffffd] var_off=(0x7fffffff, 0x0) REG INVARIANTS VIOLATION (false_reg1): range bounds violation u64=[0x7fffffff, 0x7ffffffd] s64=[0x7fffffff, 0x7ffffffd] u32=[0x7fffffff, 0x7ffffffd] s32=[0x7fffffff, 0x7ffffffd] var_off=(0x7fffffff, 0x0) REG INVARIANTS VIOLATION (false_reg2): const tnum out of sync with range bounds u64=[0x0, 0xffffffffffffffff] s64=[0x8000000000000000, 0x7fffffffffffffff] u32=[0x0, 0xffffffff] s32=[0x80000000, 0x7fffffff] var_off=(0x7fffffff, 0x0) 19: R6_w=0x7fffffff 20: (95) exit from 19 to 21: R0=0x7fffffff R6=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x7ffffffe,var_off=(0x2; 0x7ffffffd)) R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 21: R0=0x7fffffff R6=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x7ffffffe,var_off=(0x2; 0x7ffffffd)) R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 21: (14) w6 -= 2147483632 ; R6_w=scalar(smin=umin=umin32=2,smax=umax=0xffffffff,smin32=0x80000012,smax32=14,var_off=(0x2; 0xfffffffd)) 22: (76) if w6 s>= 0xe goto pc+1 ; R6_w=scalar(smin=umin=umin32=2,smax=umax=0xffffffff,smin32=0x80000012,smax32=13,var_off=(0x2; 0xfffffffd)) 23: (95) exit from 22 to 24: R0=0x7fffffff R6_w=14 R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 24: R0=0x7fffffff R6_w=14 
R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 24: (14) w6 -= 14 ; R6_w=0 [...] What can be seen here is a register invariant violation on line 19. After the binary-or in line 18, the verifier knows that bit 2 is set but knows nothing about the rest of the content which was loaded from a map value, meaning, range is [2,0x7fffffff] with var_off=(0x2; 0x7ffffffd). When in line 19 the verifier analyzes the branch, it splits the register states in reg_set_min_max() into the registers of the true branch (true_reg1, true_reg2) and the registers of the false branch (false_reg1, false_reg2). Since the test is w6 != 0x7ffffffd, the src_reg is a known constant. Internally, the verifier creates a "fake" register initialized as scalar to the value of 0x7ffffffd, and then passes it onto reg_set_min_max(). Now, for line 19, it is mathematically impossible to take the false branch of this program, yet the verifier analyzes it. It is impossible because the second bit of r6 will be set due to the prior OR operation and the constant in the condition has that bit unset (hex(fd) == binary(1111 1101)). When the verifier first analyzes the false / fall-through branch, it will compute an intersection between the var_off of r6 and of the constant. This is because the verifier creates a "fake" register initialized to the value of the constant. The intersection result later refines both registers in regs_refine_cond_op(): [...] t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); [...] Since the verifier is analyzing the false branch of the conditional jump, reg1 is equal to false_reg1 and reg2 is equal to false_reg2, i.e. the reg2 is the "fake" register that was meant to hold a constant value. 
The resulting var_off of the intersection says that both registers now hold a known value of var_off=(0x7fffffff, 0x0) or in other words: this operation manages to make the verifier think that the "constant" value that was passed in the jump operation now holds a different value. Normally this would not be an issue since it should not influence the true branch, however, false_reg2 and true_reg2 are pointers to the same "fake" register. Meaning, the false branch can influence the results of the true branch. In line 24, the verifier assumes R6_w=0, but the actual runtime value in this case is 1. The fix is simply not passing in the same "fake" register location as inputs to reg_set_min_max(), but instead making a copy. Moving the fake_reg into the env also reduces stack consumption by 120 bytes. With this, the verifier successfully rejects invalid accesses from the test program. [0] https://github.com/google/buzzer Fixes: 67420501e868 ("bpf: generalize reg_set_min_max() to handle non-const register comparisons") Reported-by: Juan José López Jaimez Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/r/20240613115310.25383-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/verifier.c | 14 ++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 50aa87f8d77ff..e4070fb02b110 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -746,6 +746,8 @@ struct bpf_verifier_env { /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; u64 prev_log_pos, prev_insn_print_pos; + /* buffer used to temporary hold constants as scalar registers */ + struct bpf_reg_state fake_reg[2]; /* buffer used to generate temporary string representations, * e.g., in reg_type_str() to generate reg_type string */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 
36ef8e96787ed..f455548ba46c9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15113,7 +15113,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; - struct bpf_reg_state fake_reg = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -15179,7 +15178,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } - src_reg = &fake_reg; + src_reg = &env->fake_reg[0]; + memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; __mark_reg_known(src_reg, insn->imm); } @@ -15239,10 +15239,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, &other_branch_regs[insn->src_reg], dst_reg, src_reg, opcode, is_jmp32); } else /* BPF_SRC(insn->code) == BPF_K */ { + /* reg_set_min_max() can mangle the fake_reg. Make a copy + * so that these are two different memory locations. The + * src_reg is not used beyond here in context of K. + */ + memcpy(&env->fake_reg[1], &env->fake_reg[0], + sizeof(env->fake_reg[0])); err = reg_set_min_max(env, &other_branch_regs[insn->dst_reg], - src_reg /* fake one */, - dst_reg, src_reg /* same fake one */, + &env->fake_reg[0], + dst_reg, &env->fake_reg[1], opcode, is_jmp32); } if (err) From e73cd1cfc2177654e562b04f514be5f0f0b96da2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jun 2024 13:53:09 +0200 Subject: [PATCH 074/272] bpf: Reduce stack consumption in check_stack_write_fixed_off The fake_reg moved into env->fake_reg given it consumes a lot of stack space (120 bytes). Migrate the fake_reg in check_stack_write_fixed_off() as well now that we have it. 
Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240613115310.25383-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f455548ba46c9..e5a0ba3bc38d4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4549,11 +4549,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.id = 0; } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && env->bpf_capable) { - struct bpf_reg_state fake_reg = {}; + struct bpf_reg_state *tmp_reg = &env->fake_reg[0]; - __mark_reg_known(&fake_reg, insn->imm); - fake_reg.type = SCALAR_VALUE; - save_register_state(env, state, spi, &fake_reg, size); + memset(tmp_reg, 0, sizeof(*tmp_reg)); + __mark_reg_known(tmp_reg, insn->imm); + tmp_reg->type = SCALAR_VALUE; + save_register_state(env, state, spi, tmp_reg, size); } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { From ceb65eb60026e03e1028a99f0ec94f22065e722a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jun 2024 13:53:10 +0200 Subject: [PATCH 075/272] selftests/bpf: Add test coverage for reg_set_min_max handling Add a test case for the jmp32/k fix to ensure selftests have coverage. Before fix: # ./vmtest.sh -- ./test_progs -t verifier_or_jmp32_k [...] ./test_progs -t verifier_or_jmp32_k tester_init:PASS:tester_log_buf 0 nsec process_subtest:PASS:obj_open_mem 0 nsec process_subtest:PASS:specs_alloc 0 nsec run_subtest:PASS:obj_open_mem 0 nsec run_subtest:FAIL:unexpected_load_success unexpected success: 0 #492/1 verifier_or_jmp32_k/or_jmp32_k: bit ops + branch on unknown value:FAIL #492 verifier_or_jmp32_k:FAIL Summary: 0/0 PASSED, 0 SKIPPED, 1 FAILED After fix: # ./vmtest.sh -- ./test_progs -t verifier_or_jmp32_k [...] 
./test_progs -t verifier_or_jmp32_k #492/1 verifier_or_jmp32_k/or_jmp32_k: bit ops + branch on unknown value:OK #492 verifier_or_jmp32_k:OK Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240613115310.25383-3-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_or_jmp32_k.c | 41 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 1c9c4ec1be11e..98ef39efa77e8 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -53,6 +53,7 @@ #include "verifier_movsx.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" +#include "verifier_or_jmp32_k.skel.h" #include "verifier_precision.skel.h" #include "verifier_prevent_map_lookup.skel.h" #include "verifier_raw_stack.skel.h" @@ -170,6 +171,7 @@ void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } +void test_verifier_or_jmp32_k(void) { RUN(verifier_or_jmp32_k); } void test_verifier_precision(void) { RUN(verifier_precision); } void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); } void test_verifier_raw_stack(void) { RUN(verifier_raw_stack); } diff --git a/tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c b/tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c new file mode 100644 index 0000000000000..f37713a265ac7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_or_jmp32_k.c @@ -0,0 +1,41 @@ +// 
SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +SEC("socket") +__description("or_jmp32_k: bit ops + branch on unknown value") +__failure +__msg("R0 invalid mem access 'scalar'") +__naked void or_jmp32_k(void) +{ + asm volatile (" \ + r0 = 0xffffffff; \ + r0 /= 1; \ + r1 = 0; \ + w1 = -1; \ + w1 >>= 1; \ + w0 &= w1; \ + w0 |= 2; \ + if w0 != 0x7ffffffd goto l1; \ + r0 = 1; \ + exit; \ +l3: \ + r0 = 5; \ + *(u64*)(r0 - 8) = r0; \ + exit; \ +l2: \ + w0 -= 0xe; \ + if w0 == 1 goto l3; \ + r0 = 4; \ + exit; \ +l1: \ + w0 -= 0x7ffffff0; \ + if w0 s>= 0xe goto l2; \ + r0 = 3; \ + exit; \ +" ::: __clobber_all); +} + +char _license[] SEC("license") = "GPL"; From b99a95bc56c52a428befbce12d9451fd7a0f3bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Thu, 13 Jun 2024 10:31:46 -0700 Subject: [PATCH 076/272] bpf: fix UML x86_64 compile failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pcpu_hot (defined in arch/x86) is not available on user mode linux (ARCH=um) Cc: Andrii Nakryiko Cc: John Fastabend Cc: Alexei Starovoitov Fixes: 1ae6921009e5 ("bpf: inline bpf_get_smp_processor_id() helper") Signed-off-by: Maciej Żenczykowski Link: https://lore.kernel.org/r/20240613173146.2524647-1-maze@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e5a0ba3bc38d4..010cfee7ffe93 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20320,7 +20320,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) /* Implement bpf_get_smp_processor_id() inline. 
*/ if (insn->imm == BPF_FUNC_get_smp_processor_id && prog->jit_requested && bpf_jit_supports_percpu_insn()) { From 9a95c5bfbf02a0a7f5983280fe284a0ff0836c34 Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Tue, 7 May 2024 01:25:41 +0000 Subject: [PATCH 077/272] ima: Avoid blocking in RCU read-side critical section A panic happens in ima_match_policy: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 PGD 42f873067 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 5 PID: 1286325 Comm: kubeletmonit.sh Kdump: loaded Tainted: P Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 RIP: 0010:ima_match_policy+0x84/0x450 Code: 49 89 fc 41 89 cf 31 ed 89 44 24 14 eb 1c 44 39 7b 18 74 26 41 83 ff 05 74 20 48 8b 1b 48 3b 1d f2 b9 f4 00 0f 84 9c 01 00 00 <44> 85 73 10 74 ea 44 8b 6b 14 41 f6 c5 01 75 d4 41 f6 c5 02 74 0f RSP: 0018:ff71570009e07a80 EFLAGS: 00010207 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000200 RDX: ffffffffad8dc7c0 RSI: 0000000024924925 RDI: ff3e27850dea2000 RBP: 0000000000000000 R08: 0000000000000000 R09: ffffffffabfce739 R10: ff3e27810cc42400 R11: 0000000000000000 R12: ff3e2781825ef970 R13: 00000000ff3e2785 R14: 000000000000000c R15: 0000000000000001 FS: 00007f5195b51740(0000) GS:ff3e278b12d40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000626d24002 CR4: 0000000000361ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ima_get_action+0x22/0x30 process_measurement+0xb0/0x830 ? page_add_file_rmap+0x15/0x170 ? alloc_set_pte+0x269/0x4c0 ? prep_new_page+0x81/0x140 ? simple_xattr_get+0x75/0xa0 ? selinux_file_open+0x9d/0xf0 ima_file_check+0x64/0x90 path_openat+0x571/0x1720 do_filp_open+0x9b/0x110 ? page_counter_try_charge+0x57/0xc0 ? files_cgroup_alloc_fd+0x38/0x60 ? __alloc_fd+0xd4/0x250 ? 
do_sys_open+0x1bd/0x250 do_sys_open+0x1bd/0x250 do_syscall_64+0x5d/0x1d0 entry_SYSCALL_64_after_hwframe+0x65/0xca Commit c7423dbdbc9e ("ima: Handle -ESTALE returned by ima_filter_rule_match()") introduced a call to ima_lsm_copy_rule() within an RCU read-side critical section, and ima_lsm_copy_rule() allocates memory with GFP_KERNEL. This implies a possible sleep and violates limitations of RCU read-side critical sections on non-PREEMPT systems. Sleeping within an RCU read-side critical section might cause synchronize_rcu() to return early and break RCU protection, allowing a UAF to happen. The root cause of this issue could be described as follows: | Thread A | Thread B | | |ima_match_policy | | | rcu_read_lock | |ima_lsm_update_rule | | | synchronize_rcu | | | | kmalloc(GFP_KERNEL)| | | sleep | ==> synchronize_rcu returns early | kfree(entry) | | | | entry = entry->next| ==> UAF happens and entry now becomes NULL (or could be anything). | | entry->action | ==> Accessing entry might cause panic. To fix this issue, convert every allocation made within the RCU read-side critical section to use GFP_ATOMIC. 
Fixes: c7423dbdbc9e ("ima: Handle -ESTALE returned by ima_filter_rule_match()") Cc: stable@vger.kernel.org Signed-off-by: GUO Zihua Acked-by: John Johansen Reviewed-by: Mimi Zohar Reviewed-by: Casey Schaufler [PM: fixed missing comment, long lines, !CONFIG_IMA_LSM_RULES case] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 5 +++-- kernel/auditfilter.c | 5 +++-- security/apparmor/audit.c | 6 +++--- security/apparmor/include/audit.h | 2 +- security/integrity/ima/ima.h | 2 +- security/integrity/ima/ima_policy.c | 15 +++++++++------ security/security.c | 6 ++++-- security/selinux/include/audit.h | 4 +++- security/selinux/ss/services.c | 5 +++-- security/smack/smack_lsm.c | 4 +++- 11 files changed, 34 insertions(+), 22 deletions(-) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index f804b76cde44e..44488b1ab9a97 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -413,7 +413,7 @@ LSM_HOOK(void, LSM_RET_VOID, key_post_create_or_update, struct key *keyring, #ifdef CONFIG_AUDIT LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr, - void **lsmrule) + void **lsmrule, gfp_t gfp) LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule) LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule) LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) diff --git a/include/linux/security.h b/include/linux/security.h index 21cf70346b330..de3af33e6ff50 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2048,7 +2048,8 @@ static inline void security_key_post_create_or_update(struct key *keyring, #ifdef CONFIG_AUDIT #ifdef CONFIG_SECURITY -int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule); +int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, + gfp_t gfp); int security_audit_rule_known(struct audit_krule *krule); int security_audit_rule_match(u32 secid, u32 
field, u32 op, void *lsmrule); void security_audit_rule_free(void *lsmrule); @@ -2056,7 +2057,7 @@ void security_audit_rule_free(void *lsmrule); #else static inline int security_audit_rule_init(u32 field, u32 op, char *rulestr, - void **lsmrule) + void **lsmrule, gfp_t gfp) { return 0; } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index be8c680121e46..d6ef4f4f9cba5 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -529,7 +529,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f_val; f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, - (void **)&f->lsm_rule); + (void **)&f->lsm_rule, + GFP_KERNEL); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { @@ -799,7 +800,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, /* our own (refreshed) copy of lsm_rule */ ret = security_audit_rule_init(df->type, df->op, df->lsm_str, - (void **)&df->lsm_rule); + (void **)&df->lsm_rule, GFP_KERNEL); /* Keep currently invalid fields around in case they * become valid after a policy reload. 
*/ if (ret == -EINVAL) { diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c index 45beb1c5f747a..6b5181c668b5b 100644 --- a/security/apparmor/audit.c +++ b/security/apparmor/audit.c @@ -217,7 +217,7 @@ void aa_audit_rule_free(void *vrule) } } -int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) +int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp) { struct aa_audit_rule *rule; @@ -230,14 +230,14 @@ int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) return -EINVAL; } - rule = kzalloc(sizeof(struct aa_audit_rule), GFP_KERNEL); + rule = kzalloc(sizeof(struct aa_audit_rule), gfp); if (!rule) return -ENOMEM; /* Currently rules are treated as coming from the root ns */ rule->label = aa_label_parse(&root_ns->unconfined->label, rulestr, - GFP_KERNEL, true, false); + gfp, true, false); if (IS_ERR(rule->label)) { int err = PTR_ERR(rule->label); aa_audit_rule_free(rule); diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index acbb03b9bd25c..0c8cc86b417b5 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -200,7 +200,7 @@ static inline int complain_error(int error) } void aa_audit_rule_free(void *vrule); -int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule); +int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp); int aa_audit_rule_known(struct audit_krule *rule); int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule); diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 3e568126cd481..c51e24d24d1e9 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -546,7 +546,7 @@ static inline void ima_free_modsig(struct modsig *modsig) #else static inline int ima_filter_rule_init(u32 field, u32 op, char *rulestr, - void **lsmrule) + void **lsmrule, gfp_t gfp) { return -EINVAL; } diff --git a/security/integrity/ima/ima_policy.c 
b/security/integrity/ima/ima_policy.c index c0556907c2e67..09da8e6392395 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -401,7 +401,8 @@ static void ima_free_rule(struct ima_rule_entry *entry) kfree(entry); } -static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry) +static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry, + gfp_t gfp) { struct ima_rule_entry *nentry; int i; @@ -410,7 +411,7 @@ static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry) * Immutable elements are copied over as pointers and data; only * lsm rules can change */ - nentry = kmemdup(entry, sizeof(*nentry), GFP_KERNEL); + nentry = kmemdup(entry, sizeof(*nentry), gfp); if (!nentry) return NULL; @@ -425,7 +426,8 @@ static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry) ima_filter_rule_init(nentry->lsm[i].type, Audit_equal, nentry->lsm[i].args_p, - &nentry->lsm[i].rule); + &nentry->lsm[i].rule, + gfp); if (!nentry->lsm[i].rule) pr_warn("rule for LSM \'%s\' is undefined\n", nentry->lsm[i].args_p); @@ -438,7 +440,7 @@ static int ima_lsm_update_rule(struct ima_rule_entry *entry) int i; struct ima_rule_entry *nentry; - nentry = ima_lsm_copy_rule(entry); + nentry = ima_lsm_copy_rule(entry, GFP_KERNEL); if (!nentry) return -ENOMEM; @@ -664,7 +666,7 @@ static bool ima_match_rules(struct ima_rule_entry *rule, } if (rc == -ESTALE && !rule_reinitialized) { - lsm_rule = ima_lsm_copy_rule(rule); + lsm_rule = ima_lsm_copy_rule(rule, GFP_ATOMIC); if (lsm_rule) { rule_reinitialized = true; goto retry; @@ -1140,7 +1142,8 @@ static int ima_lsm_rule_init(struct ima_rule_entry *entry, entry->lsm[lsm_rule].type = audit_type; result = ima_filter_rule_init(entry->lsm[lsm_rule].type, Audit_equal, entry->lsm[lsm_rule].args_p, - &entry->lsm[lsm_rule].rule); + &entry->lsm[lsm_rule].rule, + GFP_KERNEL); if (!entry->lsm[lsm_rule].rule) { pr_warn("rule for LSM \'%s\' is undefined\n", 
entry->lsm[lsm_rule].args_p); diff --git a/security/security.c b/security/security.c index e5da848c50b91..e5ca08789f741 100644 --- a/security/security.c +++ b/security/security.c @@ -5332,15 +5332,17 @@ void security_key_post_create_or_update(struct key *keyring, struct key *key, * @op: rule operator * @rulestr: rule context * @lsmrule: receive buffer for audit rule struct + * @gfp: GFP flag used for kmalloc * * Allocate and initialize an LSM audit rule structure. * * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of * an invalid rule. */ -int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule) +int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, + gfp_t gfp) { - return call_int_hook(audit_rule_init, field, op, rulestr, lsmrule); + return call_int_hook(audit_rule_init, field, op, rulestr, lsmrule, gfp); } /** diff --git a/security/selinux/include/audit.h b/security/selinux/include/audit.h index 52aca71210b47..29c7d4c86f6d5 100644 --- a/security/selinux/include/audit.h +++ b/security/selinux/include/audit.h @@ -21,12 +21,14 @@ * @op: the operator the rule uses * @rulestr: the text "target" of the rule * @rule: pointer to the new rule structure returned via this + * @gfp: GFP flag used for kmalloc * * Returns 0 if successful, -errno if not. On success, the rule structure * will be allocated internally. The caller must free this structure with * selinux_audit_rule_free() after use. */ -int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **rule); +int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **rule, + gfp_t gfp); /** * selinux_audit_rule_free - free an selinux audit rule structure. 
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index f20e1968b7f7a..e33e55384b75a 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -3507,7 +3507,8 @@ void selinux_audit_rule_free(void *vrule) } } -int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) +int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, + gfp_t gfp) { struct selinux_state *state = &selinux_state; struct selinux_policy *policy; @@ -3548,7 +3549,7 @@ int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) return -EINVAL; } - tmprule = kzalloc(sizeof(struct selinux_audit_rule), GFP_KERNEL); + tmprule = kzalloc(sizeof(struct selinux_audit_rule), gfp); if (!tmprule) return -ENOMEM; context_init(&tmprule->au_ctxt); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 70ba2841e181d..f5cbec1e6a923 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4693,11 +4693,13 @@ static int smack_post_notification(const struct cred *w_cred, * @op: required testing operator (=, !=, >, <, ...) * @rulestr: smack label to be audited * @vrule: pointer to save our own audit rule representation + * @gfp: type of the memory for the allocation * * Prepare to audit cases where (@field @op @rulestr) is true. * The label to be audited is created if necessay. */ -static int smack_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) +static int smack_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, + gfp_t gfp) { struct smack_known *skp; char **rule = (char **)vrule; From 4eb4e85c4f818491efc67e9373aa16b123c3f522 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 7 Jun 2024 12:50:14 -0700 Subject: [PATCH 078/272] btrfs: retry block group reclaim without infinite loop If inc_block_group_ro systematically fails (e.g. 
due to ETXTBSY from swap) or btrfs_relocate_chunk systematically fails (from lack of space), then this worker becomes an infinite loop. At the very least, this strands the cleaner thread, but can also result in hung tasks/RCU stalls on PREEMPT_NONE kernels and if the reclaim_bgs_lock mutex is not contended. I believe the best long-term fix is to manage reclaim via work queue, where we queue up a relocation on the triggering condition and re-queue on failure. In the meantime, this is an easy fix to apply to avoid the immediate pain. Fixes: 7e2718099438 ("btrfs: reinsert BGs failed to reclaim") CC: stable@vger.kernel.org # 6.6+ Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 1e09aeea69c22..1a66be33bb048 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1785,6 +1785,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; + LIST_HEAD(retry_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1921,8 +1922,11 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } next: - if (ret) - btrfs_mark_bg_to_reclaim(bg); + if (ret) { + /* Refcount held by the reclaim_bgs list after splice. 
*/ + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, &retry_list); + } btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -1942,6 +1946,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); end: + spin_lock(&fs_info->unused_bgs_lock); + list_splice_tail(&retry_list, &fs_info->reclaim_bgs); + spin_unlock(&fs_info->unused_bgs_lock); btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); } From cebae292e0c32a228e8f2219c270a7237be24a6a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 7 Jun 2024 13:27:48 +0200 Subject: [PATCH 079/272] btrfs: zoned: allocate dummy checksums for zoned NODATASUM writes Shin'ichiro reported that when he's running fstests' test-case btrfs/167 on emulated zoned devices, he's seeing the following NULL pointer dereference in 'btrfs_zone_finish_endio()': Oops: general protection fault, probably for non-canonical address 0xdffffc0000000011: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000088-0x000000000000008f] CPU: 4 PID: 2332440 Comm: kworker/u80:15 Tainted: G W 6.10.0-rc2-kts+ #4 Hardware name: Supermicro Super Server/X11SPi-TF, BIOS 3.3 02/21/2020 Workqueue: btrfs-endio-write btrfs_work_helper [btrfs] RIP: 0010:btrfs_zone_finish_endio.part.0+0x34/0x160 [btrfs] RSP: 0018:ffff88867f107a90 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff893e5534 RDX: 0000000000000011 RSI: 0000000000000004 RDI: 0000000000000088 RBP: 0000000000000002 R08: 0000000000000001 R09: ffffed1081696028 R10: ffff88840b4b0143 R11: ffff88834dfff600 R12: ffff88840b4b0000 R13: 0000000000020000 R14: 0000000000000000 R15: ffff888530ad5210 FS: 0000000000000000(0000) GS:ffff888e3f800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f87223fff38 CR3: 00000007a7c6a002 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __die_body.cold+0x19/0x27 ? die_addr+0x46/0x70 ? exc_general_protection+0x14f/0x250 ? asm_exc_general_protection+0x26/0x30 ? do_raw_read_unlock+0x44/0x70 ? btrfs_zone_finish_endio.part.0+0x34/0x160 [btrfs] btrfs_finish_one_ordered+0x5d9/0x19a0 [btrfs] ? __pfx_lock_release+0x10/0x10 ? do_raw_write_lock+0x90/0x260 ? __pfx_do_raw_write_lock+0x10/0x10 ? __pfx_btrfs_finish_one_ordered+0x10/0x10 [btrfs] ? _raw_write_unlock+0x23/0x40 ? btrfs_finish_ordered_zoned+0x5a9/0x850 [btrfs] ? lock_acquire+0x435/0x500 btrfs_work_helper+0x1b1/0xa70 [btrfs] ? __schedule+0x10a8/0x60b0 ? __pfx___might_resched+0x10/0x10 process_one_work+0x862/0x1410 ? __pfx_lock_acquire+0x10/0x10 ? __pfx_process_one_work+0x10/0x10 ? assign_work+0x16c/0x240 worker_thread+0x5e6/0x1010 ? __pfx_worker_thread+0x10/0x10 kthread+0x2c3/0x3a0 ? trace_irq_enable.constprop.0+0xce/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x31/0x70 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Enabling CONFIG_BTRFS_ASSERT revealed the following assertion to trigger: assertion failed: !list_empty(&ordered->list), in fs/btrfs/zoned.c:1815 This indicates, that we're missing the checksums list on the ordered_extent. As btrfs/167 is doing a NOCOW write this is to be expected. Further analysis with drgn confirmed the assumption: >>> inode = prog.crashed_thread().stack_trace()[11]['ordered'].inode >>> btrfs_inode = drgn.container_of(inode, "struct btrfs_inode", \ "vfs_inode") >>> print(btrfs_inode.flags) (u32)1 As zoned emulation mode simulates conventional zones on regular devices, we cannot use zone-append for writing. But we're only attaching dummy checksums if we're doing a zone-append write. So for NOCOW zoned data writes on conventional zones, also attach a dummy checksum. 
Reported-by: Shinichiro Kawasaki Fixes: cbfce4c7fbde ("btrfs: optimize the logical to physical mapping for zoned writes") CC: Naohiro Aota # 6.6+ Tested-by: Shin'ichiro Kawasaki Reviewed-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 477f350a8bd09..e3a57196b0ee0 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -741,7 +741,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) goto fail_put_bio; - } else if (use_append) { + } else if (use_append || + (btrfs_is_zoned(fs_info) && inode && + inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); if (ret) goto fail_put_bio; From 4467c09bc7a66a17ffd84d6262d48279b26106ea Mon Sep 17 00:00:00 2001 From: Aryan Srivastava Date: Thu, 13 Jun 2024 14:49:00 +1200 Subject: [PATCH 080/272] net: mvpp2: use slab_build_skb for oversized frames Setting frag_size to 0 to indicate kmalloc has been deprecated, use slab_build_skb directly. 
Fixes: ce098da1497c ("skbuff: Introduce slab_build_skb()") Signed-off-by: Aryan Srivastava Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240613024900.3842238-1-aryan.srivastava@alliedtelesis.co.nz Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index e91486c48de38..671368d2c77e6 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4014,7 +4014,10 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, } } - skb = build_skb(data, frag_size); + if (frag_size) + skb = build_skb(data, frag_size); + else + skb = slab_build_skb(data); if (!skb) { netdev_warn(port->dev, "skb build failed\n"); goto err_drop_frame; From 135c6eb27a85c8b261a2cc1f5093abcda6ee9010 Mon Sep 17 00:00:00 2001 From: Joel Slebodnick Date: Thu, 13 Jun 2024 14:27:28 -0400 Subject: [PATCH 081/272] scsi: ufs: core: Free memory allocated for model before reinit Under the conditions that a device is to be reinitialized within ufshcd_probe_hba(), the device must first be fully reset. Resetting the device should include freeing U8 model (member of dev_info) but does not, and this causes a memory leak. ufs_put_device_desc() is responsible for freeing model. unreferenced object 0xffff3f63008bee60 (size 32): comm "kworker/u33:1", pid 60, jiffies 4294892642 hex dump (first 32 bytes): 54 48 47 4a 46 47 54 30 54 32 35 42 41 5a 5a 41 THGJFGT0T25BAZZA 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
backtrace (crc ed7ff1a9): [] kmemleak_alloc+0x34/0x40 [] __kmalloc_noprof+0x1e4/0x2fc [] ufshcd_read_string_desc+0x94/0x190 [] ufshcd_device_init+0x480/0xdf8 [] ufshcd_probe_hba+0x3c/0x404 [] ufshcd_async_scan+0x40/0x370 [] async_run_entry_fn+0x34/0xe0 [] process_one_work+0x154/0x298 [] worker_thread+0x2f8/0x408 [] kthread+0x114/0x118 [] ret_from_fork+0x10/0x20 Fixes: 96a7141da332 ("scsi: ufs: core: Add support for reinitializing the UFS device") Cc: Reviewed-by: Andrew Halaney Reviewed-by: Bart Van Assche Signed-off-by: Joel Slebodnick Link: https://lore.kernel.org/r/20240613200202.2524194-1-jslebodn@redhat.com Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index e5e9da61f15d0..1b65e6ae41375 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8787,6 +8787,7 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params) (hba->quirks & UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH)) { /* Reset the device and controller before doing reinit */ ufshcd_device_reset(hba); + ufs_put_device_desc(hba); ufshcd_hba_stop(hba); ufshcd_vops_reinit_notify(hba); ret = ufshcd_hba_enable(hba); From 633aeefafc9c2a07a76a62be6aac1d73c3e3defa Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 13 Jun 2024 14:18:26 -0700 Subject: [PATCH 082/272] scsi: core: Introduce the BLIST_SKIP_IO_HINTS flag Prepare for skipping the IO Advice Hints Grouping mode page for USB storage devices. Cc: Alan Stern Cc: Joao Machado Cc: Andy Shevchenko Cc: Christian Heusel Cc: stable@vger.kernel.org Fixes: 4f53138fffc2 ("scsi: sd: Translate data lifetime information") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240613211828.2077477-2-bvanassche@acm.org Signed-off-by: Martin K. 
Petersen --- drivers/scsi/sd.c | 4 ++++ include/scsi/scsi_devinfo.h | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index fbc11046bbf60..fe82baa924f81 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -3118,6 +3119,9 @@ static void sd_read_io_hints(struct scsi_disk *sdkp, unsigned char *buffer) struct scsi_mode_data data; int res; + if (sdp->sdev_bflags & BLIST_SKIP_IO_HINTS) + return; + res = scsi_mode_sense(sdp, /*dbd=*/0x8, /*modepage=*/0x0a, /*subpage=*/0x05, buffer, SD_BUF_SIZE, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index 6b548dc2c4965..1d79a3b536cee 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -69,8 +69,10 @@ #define BLIST_RETRY_ITF ((__force blist_flags_t)(1ULL << 32)) /* Always retry ABORTED_COMMAND with ASC 0xc1 */ #define BLIST_RETRY_ASC_C1 ((__force blist_flags_t)(1ULL << 33)) +/* Do not query the IO Advice Hints Grouping mode page */ +#define BLIST_SKIP_IO_HINTS ((__force blist_flags_t)(1ULL << 34)) -#define __BLIST_LAST_USED BLIST_RETRY_ASC_C1 +#define __BLIST_LAST_USED BLIST_SKIP_IO_HINTS #define __BLIST_HIGH_UNUSED (~(__BLIST_LAST_USED | \ (__force blist_flags_t) \ From 57619f3cdeb5ae9f4252833b0ed600e9f81da722 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 13 Jun 2024 14:18:27 -0700 Subject: [PATCH 083/272] scsi: usb: uas: Do not query the IO Advice Hints Grouping mode page for USB/UAS devices Recently it was reported that the following USB storage devices are unusable with Linux kernel 6.9: * Kingston DataTraveler G2 * Garmin FR35 This is because attempting to read the IO Advice Hints Grouping mode page causes these devices to reset. Hence do not read the IO Advice Hints Grouping mode page from USB/UAS storage devices. 
Acked-by: Alan Stern Cc: stable@vger.kernel.org Fixes: 4f53138fffc2 ("scsi: sd: Translate data lifetime information") Reported-by: Joao Machado Closes: https://lore.kernel.org/linux-scsi/20240130214911.1863909-1-bvanassche@acm.org/T/#mf4e3410d8f210454d7e4c3d1fb5c0f41e651b85f Tested-by: Andy Shevchenko Bisected-by: Christian Heusel Reported-by: Andy Shevchenko Closes: https://lore.kernel.org/linux-scsi/CACLx9VdpUanftfPo2jVAqXdcWe8Y43MsDeZmMPooTzVaVJAh2w@mail.gmail.com/ Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240613211828.2077477-3-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/usb/storage/scsiglue.c | 6 ++++++ drivers/usb/storage/uas.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index b31464740f6c8..8c8b5e6041cc2 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -79,6 +79,12 @@ static int slave_alloc (struct scsi_device *sdev) if (us->protocol == USB_PR_BULK && us->max_lun > 0) sdev->sdev_bflags |= BLIST_FORCELUN; + /* + * Some USB storage devices reset if the IO advice hints grouping mode + * page is queried. Hence skip that mode page. + */ + sdev->sdev_bflags |= BLIST_SKIP_IO_HINTS; + return 0; } diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index a48870a87a293..b610a2de4ae5d 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -820,6 +821,12 @@ static int uas_slave_alloc(struct scsi_device *sdev) struct uas_dev_info *devinfo = (struct uas_dev_info *)sdev->host->hostdata; + /* + * Some USB storage devices reset if the IO advice hints grouping mode + * page is queried. Hence skip that mode page. 
+ */ + sdev->sdev_bflags |= BLIST_SKIP_IO_HINTS; + sdev->hostdata = devinfo; return 0; } From 2663d0462eb32ae7c9b035300ab6b1523886c718 Mon Sep 17 00:00:00 2001 From: Kenton Groombridge Date: Wed, 5 Jun 2024 11:22:18 -0400 Subject: [PATCH 084/272] wifi: mac80211: Avoid address calculations via out of bounds array indexing req->n_channels must be set before req->channels[] can be used. This patch fixes one of the issues encountered in [1]. [ 83.964255] UBSAN: array-index-out-of-bounds in net/mac80211/scan.c:364:4 [ 83.964258] index 0 is out of range for type 'struct ieee80211_channel *[]' [...] [ 83.964264] Call Trace: [ 83.964267] [ 83.964269] dump_stack_lvl+0x3f/0xc0 [ 83.964274] __ubsan_handle_out_of_bounds+0xec/0x110 [ 83.964278] ieee80211_prep_hw_scan+0x2db/0x4b0 [ 83.964281] __ieee80211_start_scan+0x601/0x990 [ 83.964291] nl80211_trigger_scan+0x874/0x980 [ 83.964295] genl_family_rcv_msg_doit+0xe8/0x160 [ 83.964298] genl_rcv_msg+0x240/0x270 [...] [1] https://bugzilla.kernel.org/show_bug.cgi?id=218810 Co-authored-by: Kees Cook Signed-off-by: Kees Cook Signed-off-by: Kenton Groombridge Link: https://msgid.link/20240605152218.236061-1-concord@gentoo.org Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 8ecc4b710b0e6..b5f2df61c7f67 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -358,7 +358,8 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; - int i, ielen, n_chans; + int i, ielen; + u32 *n_chans; u32 flags = 0; req = rcu_dereference_protected(local->scan_req, @@ -368,34 +369,34 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) return false; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { + local->hw_scan_req->req.n_channels = req->n_channels; + for (i = 0; i < 
req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } - - n_chans = req->n_channels; } else { do { if (local->hw_scan_band == NUM_NL80211_BANDS) return false; - n_chans = 0; + n_chans = &local->hw_scan_req->req.n_channels; + *n_chans = 0; for (i = 0; i < req->n_channels; i++) { if (req->channels[i]->band != local->hw_scan_band) continue; - local->hw_scan_req->req.channels[n_chans] = + local->hw_scan_req->req.channels[(*n_chans)++] = req->channels[i]; - n_chans++; + bands_used |= BIT(req->channels[i]->band); } local->hw_scan_band++; - } while (!n_chans); + } while (!*n_chans); } - local->hw_scan_req->req.n_channels = n_chans; ieee80211_prepare_scan_chandef(&chandef); if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) From 0d9c2beed116e623ac30810d382bd67163650f98 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 12 Jun 2024 12:23:51 +0200 Subject: [PATCH 085/272] wifi: mac80211: fix monitor channel with chanctx emulation After the channel context emulation, there were reports that changing the monitor channel no longer works. This is because those drivers don't have WANT_MONITOR_VIF, so setting the channel always exits early. Fix this by always allocating the virtual monitor sdata, and simply not telling the driver about it unless it wanted to. This way, we have an interface/sdata to bind the chanctx to, and the emulation can work correctly. 
Cc: stable@vger.kernel.org Fixes: 0a44dfc07074 ("wifi: mac80211: simplify non-chanctx drivers") Reported-and-tested-by: Savyasaachi Vanga Closes: https://lore.kernel.org/r/chwoymvpzwtbmzryrlitpwmta5j6mtndocxsyqvdyikqu63lon@gfds653hkknl Link: https://msgid.link/20240612122351.b12d4a109dde.I1831a44417faaab92bea1071209abbe4efbe3fba@changeid Signed-off-by: Johannes Berg --- net/mac80211/driver-ops.c | 17 +++++++++++++++++ net/mac80211/iface.c | 21 +++++++++------------ net/mac80211/util.c | 2 +- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index dce37ba8ebe37..254d745832cbf 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -311,6 +311,18 @@ int drv_assign_vif_chanctx(struct ieee80211_local *local, might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); + /* + * We should perhaps push emulate chanctx down and only + * make it call ->config() when the chanctx is actually + * assigned here (and unassigned below), but that's yet + * another change to all drivers to add assign/unassign + * emulation callbacks. Maybe later. 
+ */ + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + local->emulate_chanctx && + !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return 0; + if (!check_sdata_in_driver(sdata)) return -EIO; @@ -338,6 +350,11 @@ void drv_unassign_vif_chanctx(struct ieee80211_local *local, might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + local->emulate_chanctx && + !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return; + if (!check_sdata_in_driver(sdata)) return; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 0c54554bf761b..b935bb5d8ed1f 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1122,9 +1122,6 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) struct ieee80211_sub_if_data *sdata; int ret; - if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) - return 0; - ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); @@ -1146,11 +1143,13 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) ieee80211_set_default_queues(sdata); - ret = drv_add_interface(local, sdata); - if (WARN_ON(ret)) { - /* ok .. stupid driver, it asked for this! */ - kfree(sdata); - return ret; + if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { + ret = drv_add_interface(local, sdata); + if (WARN_ON(ret)) { + /* ok .. stupid driver, it asked for this! 
*/ + kfree(sdata); + return ret; + } } set_bit(SDATA_STATE_RUNNING, &sdata->state); @@ -1188,9 +1187,6 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; - if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) - return; - ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); @@ -1210,7 +1206,8 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) ieee80211_link_release_channel(&sdata->deflink); - drv_remove_interface(local, sdata); + if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + drv_remove_interface(local, sdata); kfree(sdata); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 283bfc99417e5..963ed75deb765 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1843,7 +1843,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* add interfaces */ sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); - if (sdata) { + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { /* in HW restart it exists already */ WARN_ON(local->resuming); res = drv_add_interface(local, sdata); From 9f36169912331fa035d7b73a91252d7c2512eb1a Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 7 Jun 2024 18:07:52 +0200 Subject: [PATCH 086/272] cipso: fix total option length computation As evident from the definition of ip_options_get(), the IP option IPOPT_END is used to pad the IP option data array, not IPOPT_NOP. Yet the loop that walks the IP options to determine the total IP options length in cipso_v4_delopt() doesn't take IPOPT_END into account. Fix it by recognizing the IPOPT_END value as the end of actual options. Fixes: 014ab19a69c3 ("selinux: Set socket NetLabel based on connection endpoint") Signed-off-by: Ondrej Mosnacek Signed-off-by: David S. 
Miller --- net/ipv4/cipso_ipv4.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index dd6d460150580..5e9ac68444f89 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -2013,12 +2013,16 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) * from there we can determine the new total option length */ iter = 0; optlen_new = 0; - while (iter < opt->opt.optlen) - if (opt->opt.__data[iter] != IPOPT_NOP) { + while (iter < opt->opt.optlen) { + if (opt->opt.__data[iter] == IPOPT_END) { + break; + } else if (opt->opt.__data[iter] == IPOPT_NOP) { + iter++; + } else { iter += opt->opt.__data[iter + 1]; optlen_new = iter; - } else - iter++; + } + } hdr_delta = opt->opt.optlen; opt->opt.optlen = (optlen_new + 3) & ~3; hdr_delta -= opt->opt.optlen; From 89aa3619d141d6cfb6040a561aebb6d99d3e2285 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 7 Jun 2024 18:07:53 +0200 Subject: [PATCH 087/272] cipso: make cipso_v4_skbuff_delattr() fully remove the CIPSO options As the comment in this function says, the code currently just clears the CIPSO part with IPOPT_NOP, rather than removing it completely and trimming the packet. The other cipso_v4_*_delattr() functions, however, do the proper removal and also calipso_skbuff_delattr() makes an effort to remove the CALIPSO options instead of replacing them with padding. Some routers treat IPv4 packets with anything (even NOPs) in the option header as a special case and take them through a slower processing path. Consequently, hardening guides such as STIG recommend to configure such routers to drop packets with non-empty IP option headers [1][2]. Thus, users might expect NetLabel to produce packets with minimal padding (or at least with no padding when no actual options are present). Implement the proper option removal to address this and to be closer to what the peer functions do. 
[1] https://www.stigviewer.com/stig/juniper_router_rtr/2019-09-27/finding/V-90937 [2] https://www.stigviewer.com/stig/cisco_ios_xe_router_rtr/2021-03-26/finding/V-217001 Signed-off-by: Ondrej Mosnacek Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 79 +++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 5e9ac68444f89..e9cb27061c12e 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1810,6 +1810,29 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, return CIPSO_V4_HDR_LEN + ret_val; } +static int cipso_v4_get_actual_opt_len(const unsigned char *data, int len) +{ + int iter = 0, optlen = 0; + + /* determining the new total option length is tricky because of + * the padding necessary, the only thing i can think to do at + * this point is walk the options one-by-one, skipping the + * padding at the end to determine the actual option size and + * from there we can determine the new total option length + */ + while (iter < len) { + if (data[iter] == IPOPT_END) { + break; + } else if (data[iter] == IPOPT_NOP) { + iter++; + } else { + iter += data[iter + 1]; + optlen = iter; + } + } + return optlen; +} + /** * cipso_v4_sock_setattr - Add a CIPSO option to a socket * @sk: the socket @@ -1986,7 +2009,6 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) u8 cipso_len; u8 cipso_off; unsigned char *cipso_ptr; - int iter; int optlen_new; cipso_off = opt->opt.cipso - sizeof(struct iphdr); @@ -2006,23 +2028,8 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) memmove(cipso_ptr, cipso_ptr + cipso_len, opt->opt.optlen - cipso_off - cipso_len); - /* determining the new total option length is tricky because of - * the padding necessary, the only thing i can think to do at - * this point is walk the options one-by-one, skipping the - * padding at the end to determine the actual option size and - * from there 
we can determine the new total option length */ - iter = 0; - optlen_new = 0; - while (iter < opt->opt.optlen) { - if (opt->opt.__data[iter] == IPOPT_END) { - break; - } else if (opt->opt.__data[iter] == IPOPT_NOP) { - iter++; - } else { - iter += opt->opt.__data[iter + 1]; - optlen_new = iter; - } - } + optlen_new = cipso_v4_get_actual_opt_len(opt->opt.__data, + opt->opt.optlen); hdr_delta = opt->opt.optlen; opt->opt.optlen = (optlen_new + 3) & ~3; hdr_delta -= opt->opt.optlen; @@ -2242,7 +2249,8 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb, */ int cipso_v4_skbuff_delattr(struct sk_buff *skb) { - int ret_val; + int ret_val, cipso_len, hdr_len_actual, new_hdr_len_actual, new_hdr_len, + hdr_len_delta; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char *cipso_ptr; @@ -2255,16 +2263,37 @@ int cipso_v4_skbuff_delattr(struct sk_buff *skb) if (ret_val < 0) return ret_val; - /* the easiest thing to do is just replace the cipso option with noop - * options since we don't change the size of the packet, although we - * still need to recalculate the checksum */ - iph = ip_hdr(skb); cipso_ptr = (unsigned char *)iph + opt->cipso; - memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]); + cipso_len = cipso_ptr[1]; + + hdr_len_actual = sizeof(struct iphdr) + + cipso_v4_get_actual_opt_len((unsigned char *)(iph + 1), + opt->optlen); + new_hdr_len_actual = hdr_len_actual - cipso_len; + new_hdr_len = (new_hdr_len_actual + 3) & ~3; + hdr_len_delta = (iph->ihl << 2) - new_hdr_len; + + /* 1. shift any options after CIPSO to the left */ + memmove(cipso_ptr, cipso_ptr + cipso_len, + new_hdr_len_actual - opt->cipso); + /* 2. move the whole IP header to its new place */ + memmove((unsigned char *)iph + hdr_len_delta, iph, new_hdr_len_actual); + /* 3. adjust the skb layout */ + skb_pull(skb, hdr_len_delta); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + /* 4. 
re-fill new padding with IPOPT_END (may now be longer) */ + memset((unsigned char *)iph + new_hdr_len_actual, IPOPT_END, + new_hdr_len - new_hdr_len_actual); + + opt->optlen -= hdr_len_delta; opt->cipso = 0; opt->is_changed = 1; - + if (hdr_len_delta != 0) { + iph->ihl = new_hdr_len >> 2; + iph_set_totlen(iph, skb->len); + } ip_send_check(iph); return 0; From 68f860426d500cfb697b505799244c7dfff604b1 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 14 Jun 2024 00:31:04 +0100 Subject: [PATCH 088/272] mfd: axp20x: AXP717: Fix missing IRQ status registers range While we list the "IRQ status *and acknowledge*" registers as volatile in the MFD description, they are missing from the writable range array, so acknowledging any interrupts was met with an -EIO error. This error propagates up, leading to the whole AXP717 driver failing to probe, which is fatal to most systems using this PMIC, since most peripherals refer to one of the PMIC voltage rails. This wasn't noticed on the initial submission, since the interrupt was completely missing at this point, but the DTs now merged describe the interrupt, creating the problem. Add the five registers that hold those bits to the writable array. This fixes the boot on the Anbernic systems using the AXP717 PMIC. 
Fixes: b5bfc8ab2484 ("mfd: axp20x: Add support for AXP717 PMIC") Reported-by: Chris Morgan Signed-off-by: Andre Przywara Reviewed-by: John Watts Link: https://lore.kernel.org/r/20240613233104.17529-1-andre.przywara@arm.com Signed-off-by: Lee Jones --- drivers/mfd/axp20x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c index f2c0f144c0fc3..dacd3c96c9f57 100644 --- a/drivers/mfd/axp20x.c +++ b/drivers/mfd/axp20x.c @@ -210,6 +210,7 @@ static const struct regmap_access_table axp313a_volatile_table = { static const struct regmap_range axp717_writeable_ranges[] = { regmap_reg_range(AXP717_IRQ0_EN, AXP717_IRQ4_EN), + regmap_reg_range(AXP717_IRQ0_STATE, AXP717_IRQ4_STATE), regmap_reg_range(AXP717_DCDC_OUTPUT_CONTROL, AXP717_CPUSLDO_CONTROL), }; From 004b8d1491b4bcbb7da1a3206d1e7e66822d47c6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 14 Jun 2024 09:55:58 +0200 Subject: [PATCH 089/272] ovl: fix encoding fid for lower only root ovl_check_encode_origin() should return a positive number if the lower dentry is to be encoded, zero otherwise. If there's no upper layer at all (read-only overlay), then it obviously needs to return positive. This was broken by commit 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles"), which didn't take the lower-only configuration into account. Fix by checking the no-upper-layer case up-front. 
Reported-and-tested-by: Youzhong Yang Closes: https://lore.kernel.org/all/CADpNCvaBimi+zCYfRJHvCOhMih8OU0rmZkwLuh24MKKroRuT8Q@mail.gmail.com/ Fixes: 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles") Cc: # v6.6 Signed-off-by: Miklos Szeredi --- fs/overlayfs/export.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 063409069f56d..5868cb2229552 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -181,6 +181,10 @@ static int ovl_check_encode_origin(struct dentry *dentry) struct ovl_fs *ofs = OVL_FS(dentry->d_sb); bool decodable = ofs->config.nfs_export; + /* No upper layer? */ + if (!ovl_upper_mnt(ofs)) + return 1; + /* Lower file handle for non-upper non-decodable */ if (!ovl_dentry_upper(dentry) && !decodable) return 1; @@ -209,7 +213,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ - if (d_is_dir(dentry) && ovl_upper_mnt(ofs) && decodable) + if (d_is_dir(dentry) && decodable) return ovl_connect_layer(dentry); /* Lower file handle for indexed and non-upper dir/non-dir */ From 721f2e6653f5ab0cc52b3a459c4a2158b92fcf80 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Thu, 13 Jun 2024 14:37:11 +0100 Subject: [PATCH 090/272] ALSA: hda: cs35l56: Component should be unbound before deconstruction The interface associated with the hda_component should be deactivated before the driver is deconstructed during removal. 
Fixes: 73cfbfa9caea ("ALSA: hda/cs35l56: Add driver for Cirrus Logic CS35L56 amplifier") Signed-off-by: Simon Trimmer Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240613133713.75550-2-simont@opensource.cirrus.com --- sound/pci/hda/cs35l56_hda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/cs35l56_hda.c b/sound/pci/hda/cs35l56_hda.c index 0923e2589f5f7..e134ede6c5aa5 100644 --- a/sound/pci/hda/cs35l56_hda.c +++ b/sound/pci/hda/cs35l56_hda.c @@ -1077,12 +1077,12 @@ void cs35l56_hda_remove(struct device *dev) { struct cs35l56_hda *cs35l56 = dev_get_drvdata(dev); + component_del(cs35l56->base.dev, &cs35l56_hda_comp_ops); + pm_runtime_dont_use_autosuspend(cs35l56->base.dev); pm_runtime_get_sync(cs35l56->base.dev); pm_runtime_disable(cs35l56->base.dev); - component_del(cs35l56->base.dev, &cs35l56_hda_comp_ops); - cs_dsp_remove(&cs35l56->cs_dsp); kfree(cs35l56->system_name); From 6f9a40d61cad0f5560e8530b4dd4a05fc4d15987 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Thu, 13 Jun 2024 14:37:12 +0100 Subject: [PATCH 091/272] ALSA: hda: cs35l41: Component should be unbound before deconstruction The interface associated with the hda_component should be deactivated before the driver is deconstructed during removal. 
Fixes: 7b2f3eb492da ("ALSA: hda: cs35l41: Add support for CS35L41 in HDA systems") Signed-off-by: Simon Trimmer Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240613133713.75550-3-simont@opensource.cirrus.com --- sound/pci/hda/cs35l41_hda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index d54d4d60b03ec..031703f010be5 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -2019,6 +2019,8 @@ void cs35l41_hda_remove(struct device *dev) { struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev); + component_del(cs35l41->dev, &cs35l41_hda_comp_ops); + pm_runtime_get_sync(cs35l41->dev); pm_runtime_dont_use_autosuspend(cs35l41->dev); pm_runtime_disable(cs35l41->dev); @@ -2026,8 +2028,6 @@ void cs35l41_hda_remove(struct device *dev) if (cs35l41->halo_initialized) cs35l41_remove_dsp(cs35l41); - component_del(cs35l41->dev, &cs35l41_hda_comp_ops); - acpi_dev_put(cs35l41->dacpi); pm_runtime_put_noidle(cs35l41->dev); From d832b5a03e94a2a9f866dab3d04937a0f84ea116 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Thu, 13 Jun 2024 14:37:13 +0100 Subject: [PATCH 092/272] ALSA: hda: tas2781: Component should be unbound before deconstruction The interface associated with the hda_component should be deactivated before the driver is deconstructed during removal. 
Fixes: 4e7914eb1dae ("ALSA: hda/tas2781: remove sound controls in unbind") Signed-off-by: Simon Trimmer Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240613133713.75550-4-simont@opensource.cirrus.com --- sound/pci/hda/tas2781_hda_i2c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/tas2781_hda_i2c.c b/sound/pci/hda/tas2781_hda_i2c.c index 75f7674c66ee7..fdee6592c502d 100644 --- a/sound/pci/hda/tas2781_hda_i2c.c +++ b/sound/pci/hda/tas2781_hda_i2c.c @@ -777,11 +777,11 @@ static void tas2781_hda_remove(struct device *dev) { struct tas2781_hda *tas_hda = dev_get_drvdata(dev); + component_del(tas_hda->dev, &tas2781_hda_comp_ops); + pm_runtime_get_sync(tas_hda->dev); pm_runtime_disable(tas_hda->dev); - component_del(tas_hda->dev, &tas2781_hda_comp_ops); - pm_runtime_put_noidle(tas_hda->dev); tasdevice_remove(tas_hda->priv); From 2bbe3e5a2f4ef69d13be54f1cf895b4658287080 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 14 Jun 2024 12:17:33 +0200 Subject: [PATCH 093/272] bpf: Avoid splat in pskb_pull_reason syzkaller builds (CONFIG_DEBUG_NET=y) frequently trigger a debug hint in pskb_may_pull. We'd like to retain this debug check because it might hint at integer overflows and other issues (kernel code should pull headers, not huge value). In bpf case, this splat isn't interesting at all: such (nonsensical) bpf programs are typically generated by a fuzzer anyway. Do what Eric suggested and suppress such warning. For CONFIG_DEBUG_NET=n we don't need the extra check because pskb_may_pull will do the right thing: return an error without the WARN() backtrace. 
Fixes: 219eee9c0d16 ("net: skbuff: add overflow debug check to pull/push helpers") Reported-by: syzbot+0c4150bff9fff3bf023c@syzkaller.appspotmail.com Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Daniel Borkmann Closes: https://syzkaller.appspot.com/bug?extid=0c4150bff9fff3bf023c Link: https://lore.kernel.org/netdev/9f254c96-54f2-4457-b7ab-1d9f6187939c@gmail.com/ Link: https://lore.kernel.org/bpf/20240614101801.9496-1-fw@strlen.de --- net/core/filter.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 2510464692af0..9933851c685e7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1665,6 +1665,11 @@ static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { +#ifdef CONFIG_DEBUG_NET + /* Avoid a splat in pskb_may_pull_reason() */ + if (write_len > INT_MAX) + return -EINVAL; +#endif return skb_ensure_writable(skb, write_len); } From d2278f3533a8c4933c52f85784ffa73e8250c524 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 14 Jun 2024 17:22:25 +0200 Subject: [PATCH 094/272] thermal: core: Synchronize suspend-prepare and post-suspend actions After commit 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") it is theoretically possible that, if a system suspend starts immediately after a system resume, thermal_zone_device_resume() spawned by the thermal PM notifier for one of the thermal zones at the end of the system resume will run after the PM thermal notifier for the suspend-prepare action. If that happens, tz->suspended set by the latter will be reset by the former which may lead to unexpected consequences. To avoid that race, synchronize thermal_zone_device_resume() with the suspend-prepare thermal PM notifier with the help of additional bool field and completion in struct thermal_zone_device. 
Note that this also ensures running __thermal_zone_device_update() at least once for each thermal zone between system resume and the following system suspend in case it is needed to start thermal mitigation. Fixes: 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 21 +++++++++++++++++++++ drivers/thermal/thermal_core.h | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 30567b4994551..f92529fb0d10e 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1397,6 +1397,7 @@ thermal_zone_device_register_with_trips(const char *type, ida_init(&tz->ida); mutex_init(&tz->lock); init_completion(&tz->removal); + init_completion(&tz->resume); id = ida_alloc(&thermal_tz_ida, GFP_KERNEL); if (id < 0) { result = id; @@ -1642,6 +1643,9 @@ static void thermal_zone_device_resume(struct work_struct *work) thermal_zone_device_init(tz); __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); + complete(&tz->resume); + tz->resuming = false; + mutex_unlock(&tz->lock); } @@ -1659,6 +1663,20 @@ static int thermal_pm_notify(struct notifier_block *nb, list_for_each_entry(tz, &thermal_tz_list, node) { mutex_lock(&tz->lock); + if (tz->resuming) { + /* + * thermal_zone_device_resume() queued up for + * this zone has not acquired the lock yet, so + * release it to let the function run and wait + * until it has done the work. 
+ */ + mutex_unlock(&tz->lock); + + wait_for_completion(&tz->resume); + + mutex_lock(&tz->lock); + } + tz->suspended = true; mutex_unlock(&tz->lock); @@ -1676,6 +1694,9 @@ static int thermal_pm_notify(struct notifier_block *nb, cancel_delayed_work(&tz->poll_queue); + reinit_completion(&tz->resume); + tz->resuming = true; + /* * Replace the work function with the resume one, which * will restore the original work function and schedule diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 20e7b45673d68..66f67e54e0c8d 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -55,6 +55,7 @@ struct thermal_governor { * @type: the thermal zone device type * @device: &struct device for this thermal zone * @removal: removal completion + * @resume: resume completion * @trip_temp_attrs: attributes for trip points for sysfs: trip temperature * @trip_type_attrs: attributes for trip points for sysfs: trip type * @trip_hyst_attrs: attributes for trip points for sysfs: trip hysteresis @@ -89,6 +90,7 @@ struct thermal_governor { * @poll_queue: delayed work for polling * @notify_event: Last notification event * @suspended: thermal zone suspend indicator + * @resuming: indicates whether or not thermal zone resume is in progress * @trips: array of struct thermal_trip objects */ struct thermal_zone_device { @@ -96,6 +98,7 @@ struct thermal_zone_device { char type[THERMAL_NAME_LENGTH]; struct device device; struct completion removal; + struct completion resume; struct attribute_group trips_attribute_group; struct thermal_attr *trip_temp_attrs; struct thermal_attr *trip_type_attrs; @@ -123,6 +126,7 @@ struct thermal_zone_device { struct delayed_work poll_queue; enum thermal_notify_event notify_event; bool suspended; + bool resuming; #ifdef CONFIG_THERMAL_DEBUGFS struct thermal_debugfs *debugfs; #endif From 494c7d055081da066424706b28faa9a4c719d852 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Fri, 14 Jun 2024 17:26:00 +0200 Subject: [PATCH 095/272] thermal: core: Change PM notifier priority to the minimum It is reported that commit 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") causes battery data in sysfs on Thinkpad P1 Gen2 to become invalid after a resume from S3 (and it is necessary to reboot the machine to restore correct battery data). Some investigation into the problem indicated that it happened because, after the commit in question, the ACPI battery PM notifier ran in parallel with thermal_zone_device_resume() for one of the thermal zones which apparently confused the platform firmware on the affected system. While the exact reason for the firmware confusion remains unclear, it is arguably not particularly relevant, and the expected behavior of the affected system can be restored by making the thermal PM notifier run at the lowest priority which avoids interference between work items spawned by it and the other PM notifiers (that will run before those work items now). Fixes: 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218881 Reported-by: fhortner@yahoo.de Tested-by: fhortner@yahoo.de Cc: 6.8+ # 6.8+ Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index f92529fb0d10e..f7c38c0d6199b 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1721,6 +1721,12 @@ static int thermal_pm_notify(struct notifier_block *nb, static struct notifier_block thermal_pm_nb = { .notifier_call = thermal_pm_notify, + /* + * Run at the lowest priority to avoid interference between the thermal + * zone resume work items spawned by thermal_pm_notify() and the other + * PM notifiers. 
+ */ + .priority = INT_MIN, }; static int __init thermal_init(void) From 0a5d3258d7c97295a89d22e54733b54aacb62562 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 3 Jun 2024 22:23:15 -0700 Subject: [PATCH 096/272] compiler_types.h: Define __retain for __attribute__((__retain__)) Some code includes the __used macro to prevent functions and data from being optimized out. This macro implements __attribute__((__used__)), which operates at the compiler and IR-level, and so still allows a linker to remove objects intended to be kept. Compilers supporting __attribute__((__retain__)) can address this gap by setting the flag SHF_GNU_RETAIN on the section of a function/variable, indicating to the linker the object should be retained. This attribute is available since gcc 11, clang 13, and binutils 2.36. Provide a __retain macro implementing __attribute__((__retain__)), whose first user will be the '__bpf_kfunc' tag. [ Additional remark from discussion: Why is CONFIG_LTO_CLANG added here? The __used macro permits garbage collection at section level, so CONFIG_LTO_CLANG without CONFIG_LD_DEAD_CODE_DATA_ELIMINATION should not change final section dynamics? The conditional guard was included to ensure consistent behaviour between __retain and other features forcing split sections. In particular, the same guard is used in vmlinux.lds.h to merge split sections where needed. For example, using __retain in LLVM builds without CONFIG_LTO was failing CI tests on kernel-patches/bpf because the kernel didn't boot properly. And in further testing, the kernel had no issues loading BPF kfunc modules with such split sections, so the module (partial) linking scripts were left alone. 
] Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Cc: Yonghong Song Link: https://lore.kernel.org/bpf/ZlmGoT9KiYLZd91S@krava/T/ Link: https://lore.kernel.org/bpf/b31bca5a5e6765a0f32cc8c19b1d9cdbfaa822b5.1717477560.git.Tony.Ambardar@gmail.com --- include/linux/compiler_types.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 93600de3800bf..f14c275950b5b 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -143,6 +143,29 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __preserve_most #endif +/* + * Annotating a function/variable with __retain tells the compiler to place + * the object in its own section and set the flag SHF_GNU_RETAIN. This flag + * instructs the linker to retain the object during garbage-cleanup or LTO + * phases. + * + * Note that the __used macro is also used to prevent functions or data + * being optimized out, but operates at the compiler/IR-level and may still + * allow unintended removal of objects during linking. + * + * Optional: only supported since gcc >= 11, clang >= 13 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-retain-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#retain + */ +#if __has_attribute(__retain__) && \ + (defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || \ + defined(CONFIG_LTO_CLANG)) +# define __retain __attribute__((__retain__)) +#else +# define __retain +#endif + /* Compiler specific macros. 
*/ #ifdef __clang__ #include From 7bdcedd5c8fb88e7176b93812b139eca5fe0aa46 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 3 Jun 2024 22:23:16 -0700 Subject: [PATCH 097/272] bpf: Harden __bpf_kfunc tag against linker kfunc removal BPF kfuncs are often not directly referenced and may be inadvertently removed by optimization steps during kernel builds, thus the __bpf_kfunc tag mitigates against this removal by including the __used macro. However, this macro alone does not prevent removal during linking, and may still yield build warnings (e.g. on mips64el): [...] LD vmlinux BTFIDS vmlinux WARN: resolve_btfids: unresolved symbol bpf_verify_pkcs7_signature WARN: resolve_btfids: unresolved symbol bpf_lookup_user_key WARN: resolve_btfids: unresolved symbol bpf_lookup_system_key WARN: resolve_btfids: unresolved symbol bpf_key_put WARN: resolve_btfids: unresolved symbol bpf_iter_task_next WARN: resolve_btfids: unresolved symbol bpf_iter_css_task_new WARN: resolve_btfids: unresolved symbol bpf_get_file_xattr WARN: resolve_btfids: unresolved symbol bpf_ct_insert_entry WARN: resolve_btfids: unresolved symbol bpf_cgroup_release WARN: resolve_btfids: unresolved symbol bpf_cgroup_from_id WARN: resolve_btfids: unresolved symbol bpf_cgroup_acquire WARN: resolve_btfids: unresolved symbol bpf_arena_free_pages NM System.map SORTTAB vmlinux OBJCOPY vmlinux.32 [...] Update the __bpf_kfunc tag to better guard against linker optimization by including the new __retain compiler macro, which fixes the warnings above. Verify the __retain macro with readelf by checking object flags for 'R': $ readelf -Wa kernel/trace/bpf_trace.o Section Headers: [Nr] Name Type Address Off Size ES Flg Lk Inf Al [...] [178] .text.bpf_key_put PROGBITS 00000000 6420 0050 00 AXR 0 0 8 [...] Key to Flags: [...] 
R (retain), D (mbind), p (processor specific) Fixes: 57e7c169cd6a ("bpf: Add __bpf_kfunc tag for marking kernel functions as kfuncs") Reported-by: kernel test robot Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Tested-by: Jiri Olsa Reviewed-by: Jiri Olsa Cc: Yonghong Song Closes: https://lore.kernel.org/r/202401211357.OCX9yllM-lkp@intel.com/ Link: https://lore.kernel.org/bpf/ZlmGoT9KiYLZd91S@krava/T/ Link: https://lore.kernel.org/bpf/e9c64e9b5c073dabd457ff45128aabcab7630098.1717477560.git.Tony.Ambardar@gmail.com --- include/linux/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index f9e56fd12a9fd..7c3e40c3295ef 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -82,7 +82,7 @@ * as to avoid issues such as the compiler inlining or eliding either a static * kfunc, or a global kfunc in an LTO build. */ -#define __bpf_kfunc __used noinline +#define __bpf_kfunc __used __retain noinline #define __bpf_kfunc_start_defs() \ __diag_push(); \ From 7ed352d34f1a09a7659c53de07785115587499fe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 13 Jun 2024 14:30:44 -0700 Subject: [PATCH 098/272] netdev-genl: fix error codes when outputting XDP features -EINVAL will interrupt the dump. The correct error to return if we have more data to dump is -EMSGSIZE. Discovered by doing: for i in `seq 80`; do ip link add type veth; done ./cli.py --dbg-small-recv 5300 --spec netdev.yaml --dump dev-get >> /dev/null [...] 
nl_len = 64 (48) nl_flags = 0x0 nl_type = 19 nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 error: -22 Fixes: d3d854fd6a1d ("netdev-genl: create a simple family for netdev stuff") Reviewed-by: Amritha Nambiar Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240613213044.3675745-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1f6ae6379e0fc..05f9515d2c05c 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -59,22 +59,22 @@ XDP_METADATA_KFUNC_xxx nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, xdp_rx_meta, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, - xsk_features, NETDEV_A_DEV_PAD)) { - genlmsg_cancel(rsp, hdr); - return -EINVAL; - } + xsk_features, NETDEV_A_DEV_PAD)) + goto err_cancel_msg; if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, - netdev->xdp_zc_max_segs)) { - genlmsg_cancel(rsp, hdr); - return -EINVAL; - } + netdev->xdp_zc_max_segs)) + goto err_cancel_msg; } genlmsg_end(rsp, hdr); return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; } static void From c03984d43a9dd9282da54ccf275419f666029452 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 15 Jun 2024 16:00:43 +0800 Subject: [PATCH 099/272] arm64: dts: imx8mp: Fix TC9595 input clock on DH i.MX8M Plus DHCOM SoM The IMX8MP_CLK_CLKOUT2 supplies the TC9595 bridge with 13 MHz reference clock. The IMX8MP_CLK_CLKOUT2 is supplied from IMX8MP_AUDIO_PLL2_OUT. The IMX8MP_CLK_CLKOUT2 operates only as a power-of-two divider, and the current 156 MHz is not power-of-two divisible to achieve 13 MHz. To achieve 13 MHz output from IMX8MP_CLK_CLKOUT2, set IMX8MP_AUDIO_PLL2_OUT to 208 MHz, because 208 MHz / 16 = 13 MHz. 
Fixes: 20d0b83e712b ("arm64: dts: imx8mp: Add TC9595 bridge on DH electronics i.MX8M Plus DHCOM") Signed-off-by: Marek Vasut Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi index 43f1d45ccc96f..f5115f9e8c473 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi @@ -254,7 +254,7 @@ <&clk IMX8MP_CLK_CLKOUT2>, <&clk IMX8MP_AUDIO_PLL2_OUT>; assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL2_OUT>; - assigned-clock-rates = <13000000>, <13000000>, <156000000>; + assigned-clock-rates = <13000000>, <13000000>, <208000000>; reset-gpios = <&gpio4 1 GPIO_ACTIVE_HIGH>; status = "disabled"; From bcdea3e81ea51c9e89e3b11aac2612e1b4330bee Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Tue, 14 May 2024 11:07:18 +0800 Subject: [PATCH 100/272] arm: dts: imx53-qsb-hdmi: Disable panel instead of deleting node We cannot use /delete-node/ directive to delete a node in a DT overlay. The node won't be deleted effectively. Instead, set the node's status property to "disabled" to achieve something similar. 
Fixes: eeb403df953f ("ARM: dts: imx53-qsb: add support for the HDMI expander") Signed-off-by: Liu Ying Reviewed-by: Dmitry Baryshkov Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi | 2 +- arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi b/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi index d804404464737..05d7a462ea25a 100644 --- a/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx53-qsb-common.dtsi @@ -85,7 +85,7 @@ }; }; - panel { + panel_dpi: panel { compatible = "sii,43wvf1g"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_display_power>; diff --git a/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso b/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso index c84e9b0525276..151e9cee3c87e 100644 --- a/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso +++ b/arch/arm/boot/dts/nxp/imx/imx53-qsb-hdmi.dtso @@ -10,8 +10,6 @@ /plugin/; &{/} { - /delete-node/ panel; - hdmi: connector-hdmi { compatible = "hdmi-connector"; label = "hdmi"; @@ -82,6 +80,10 @@ }; }; +&panel_dpi { + status = "disabled"; +}; + &tve { status = "disabled"; }; From a1439d89480754ddbc0a837544129ff5100f4087 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 10 Jun 2024 14:12:45 +0200 Subject: [PATCH 101/272] efi/arm: Disable LPAE PAN when calling EFI runtime services EFI runtime services are remapped into the lower 1 GiB of virtual address space at boot, so they are guaranteed to be able to co-exist with the kernel virtual mappings without the need to allocate space for them in the kernel's vmalloc region, which is rather small. This means those mappings are covered by TTBR0 when LPAE PAN is enabled, and so 'user' access must be enabled while such calls are in progress. 
Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/efi.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index 78282ced50387..e408399d5f0e6 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_EFI void efi_init(void); @@ -25,6 +26,18 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md, boo #define arch_efi_call_virt_setup() efi_virtmap_load() #define arch_efi_call_virt_teardown() efi_virtmap_unload() +#ifdef CONFIG_CPU_TTBR0_PAN +#undef arch_efi_call_virt +#define arch_efi_call_virt(p, f, args...) ({ \ + unsigned int flags = uaccess_save_and_enable(); \ + efi_status_t res = _Generic((p)->f(args), \ + efi_status_t: (p)->f(args), \ + default: ((p)->f(args), EFI_ABORTED)); \ + uaccess_restore(flags); \ + res; \ +}) +#endif + #define ARCH_EFI_IRQ_FLAGS_MASK \ (PSR_J_BIT | PSR_E_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | \ PSR_T_BIT | MODE_MASK) From 75dde792d6f6c2d0af50278bd374bf0c512fe196 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 10 Jun 2024 16:02:13 +0200 Subject: [PATCH 102/272] efi/x86: Free EFI memory map only when installing a new one. The logic in __efi_memmap_init() is shared between two different execution flows: - mapping the EFI memory map early or late into the kernel VA space, so that its entries can be accessed; - the x86 specific cloning of the EFI memory map in order to insert new entries that are created as a result of making a memory reservation via a call to efi_mem_reserve(). In the former case, the underlying memory containing the kernel's view of the EFI memory map (which may be heavily modified by the kernel itself on x86) is not modified at all, and the only thing that changes is the virtual mapping of this memory, which is different between early and late boot. 
In the latter case, an entirely new allocation is created that carries a new, updated version of the kernel's view of the EFI memory map. When installing this new version, the old version will no longer be referenced, and if the memory was allocated by the kernel, it will leak unless it gets freed. The logic that implements this freeing currently lives on the code path that is shared between these two use cases, but it should only apply to the latter. So move it to the correct spot. While at it, drop the dummy definition for non-x86 architectures, as that is no longer needed. Cc: Fixes: f0ef6523475f ("efi: Fix efi_memmap_alloc() leaks") Tested-by: Ashish Kalra Link: https://lore.kernel.org/all/36ad5079-4326-45ed-85f6-928ff76483d3@amd.com Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 1 - arch/x86/platform/efi/memmap.c | 12 +++++++++++- drivers/firmware/efi/memmap.c | 9 --------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 1dc600fa3ba53..481096177500e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -401,7 +401,6 @@ extern int __init efi_memmap_alloc(unsigned int num_entries, struct efi_memory_map_data *data); extern void __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags); -#define __efi_memmap_free __efi_memmap_free extern int __init efi_memmap_install(struct efi_memory_map_data *data); extern int __init efi_memmap_split_count(efi_memory_desc_t *md, diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c index 4ef20b49eb5e7..6ed1935504b96 100644 --- a/arch/x86/platform/efi/memmap.c +++ b/arch/x86/platform/efi/memmap.c @@ -92,12 +92,22 @@ int __init efi_memmap_alloc(unsigned int num_entries, */ int __init efi_memmap_install(struct efi_memory_map_data *data) { + unsigned long size = efi.memmap.desc_size * efi.memmap.nr_map; + unsigned long flags = efi.memmap.flags; + u64 phys = efi.memmap.phys_map; + int 
ret; + efi_memmap_unmap(); if (efi_enabled(EFI_PARAVIRT)) return 0; - return __efi_memmap_init(data); + ret = __efi_memmap_init(data); + if (ret) + return ret; + + __efi_memmap_free(phys, size, flags); + return 0; } /** diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c index 3365944f79654..34109fd86c55d 100644 --- a/drivers/firmware/efi/memmap.c +++ b/drivers/firmware/efi/memmap.c @@ -15,10 +15,6 @@ #include #include -#ifndef __efi_memmap_free -#define __efi_memmap_free(phys, size, flags) do { } while (0) -#endif - /** * __efi_memmap_init - Common code for mapping the EFI memory map * @data: EFI memory map data @@ -51,11 +47,6 @@ int __init __efi_memmap_init(struct efi_memory_map_data *data) return -ENOMEM; } - if (efi.memmap.flags & (EFI_MEMMAP_MEMBLOCK | EFI_MEMMAP_SLAB)) - __efi_memmap_free(efi.memmap.phys_map, - efi.memmap.desc_size * efi.memmap.nr_map, - efi.memmap.flags); - map.phys_map = data->phys_map; map.nr_map = data->size / data->desc_size; map.map_end = map.map + data->size; From 46e27b9961d8712bc89234444ede314cec0e8bae Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 13 Jun 2024 12:20:31 -0400 Subject: [PATCH 103/272] efi/arm64: Fix kmemleak false positive in arm64_efi_rt_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kmemleak code sometimes complains about the following leak: unreferenced object 0xffff8000102e0000 (size 32768):   comm "swapper/0", pid 1, jiffies 4294937323 (age 71.240s)   hex dump (first 32 bytes):     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................   
backtrace:     [<00000000db9a88a3>] __vmalloc_node_range+0x324/0x450     [<00000000ff8903a4>] __vmalloc_node+0x90/0xd0     [<000000001a06634f>] arm64_efi_rt_init+0x64/0xdc     [<0000000007826a8d>] do_one_initcall+0x178/0xac0     [<0000000054a87017>] do_initcalls+0x190/0x1d0     [<00000000308092d0>] kernel_init_freeable+0x2c0/0x2f0     [<000000003e7b99e0>] kernel_init+0x28/0x14c     [<000000002246af5b>] ret_from_fork+0x10/0x20 The memory object in this case is for efi_rt_stack_top and is allocated in an initcall. So this is certainly a false positive. Mark the object as not a leak to quash it. Signed-off-by: Waiman Long Signed-off-by: Ard Biesheuvel --- arch/arm64/kernel/efi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 4a92096db34e0..712718aed5dd9 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -213,6 +214,7 @@ l: if (!p) { return -ENOMEM; } + kmemleak_not_leak(p); efi_rt_stack_top = p + THREAD_SIZE; return 0; } From e1b4622efbe7ad09c9a902365a993f68c270c453 Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Wed, 22 May 2024 14:38:28 -0700 Subject: [PATCH 104/272] arm64: dts: freescale: imx8mp-venice-gw73xx-2x: fix BT shutdown GPIO Fix the invalid BT shutdown GPIO (gpio1_io3 not gpio4_io16) Fixes: 716ced308234 ("arm64: dts: freescale: Add imx8mp-venice-gw73xx-2x") Signed-off-by: Tim Harvey Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi index dec57fad68285..e2b5e7ac3e465 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi @@ -219,7 +219,7 @@ bluetooth { compatible = "brcm,bcm4330-bt"; - shutdown-gpios = <&gpio4 16 GPIO_ACTIVE_HIGH>; 
+ shutdown-gpios = <&gpio1 3 GPIO_ACTIVE_HIGH>; }; }; From 58f7e1e2c9e72c7974054c64c3abeac81c11f822 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 30 May 2024 19:06:29 +0800 Subject: [PATCH 105/272] ocfs2: fix NULL pointer dereference in ocfs2_journal_dirty() bdev->bd_super has been removed and commit 8887b94d9322 change the usage from bdev->bd_super to b_assoc_map->host->i_sb. This introduces the following NULL pointer dereference in ocfs2_journal_dirty() since b_assoc_map is still not initialized. This can be easily reproduced by running xfstests generic/186, which simulate no more credits. [ 134.351592] BUG: kernel NULL pointer dereference, address: 0000000000000000 ... [ 134.355341] RIP: 0010:ocfs2_journal_dirty+0x14f/0x160 [ocfs2] ... [ 134.365071] Call Trace: [ 134.365312] [ 134.365524] ? __die_body+0x1e/0x60 [ 134.365868] ? page_fault_oops+0x13d/0x4f0 [ 134.366265] ? __pfx_bit_wait_io+0x10/0x10 [ 134.366659] ? schedule+0x27/0xb0 [ 134.366981] ? exc_page_fault+0x6a/0x140 [ 134.367356] ? asm_exc_page_fault+0x26/0x30 [ 134.367762] ? ocfs2_journal_dirty+0x14f/0x160 [ocfs2] [ 134.368305] ? ocfs2_journal_dirty+0x13d/0x160 [ocfs2] [ 134.368837] ocfs2_create_new_meta_bhs.isra.51+0x139/0x2e0 [ocfs2] [ 134.369454] ocfs2_grow_tree+0x688/0x8a0 [ocfs2] [ 134.369927] ocfs2_split_and_insert.isra.67+0x35c/0x4a0 [ocfs2] [ 134.370521] ocfs2_split_extent+0x314/0x4d0 [ocfs2] [ 134.371019] ocfs2_change_extent_flag+0x174/0x410 [ocfs2] [ 134.371566] ocfs2_add_refcount_flag+0x3fa/0x630 [ocfs2] [ 134.372117] ocfs2_reflink_remap_extent+0x21b/0x4c0 [ocfs2] [ 134.372994] ? inode_update_timestamps+0x4a/0x120 [ 134.373692] ? __pfx_ocfs2_journal_access_di+0x10/0x10 [ocfs2] [ 134.374545] ? __pfx_ocfs2_journal_access_di+0x10/0x10 [ocfs2] [ 134.375393] ocfs2_reflink_remap_blocks+0xe4/0x4e0 [ocfs2] [ 134.376197] ocfs2_remap_file_range+0x1de/0x390 [ocfs2] [ 134.376971] ? 
security_file_permission+0x29/0x50 [ 134.377644] vfs_clone_file_range+0xfe/0x320 [ 134.378268] ioctl_file_clone+0x45/0xa0 [ 134.378853] do_vfs_ioctl+0x457/0x990 [ 134.379422] __x64_sys_ioctl+0x6e/0xd0 [ 134.379987] do_syscall_64+0x5d/0x170 [ 134.380550] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 134.381231] RIP: 0033:0x7fa4926397cb [ 134.381786] Code: 73 01 c3 48 8b 0d bd 56 38 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 8d 56 38 00 f7 d8 64 89 01 48 [ 134.383930] RSP: 002b:00007ffc2b39f7b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 134.384854] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fa4926397cb [ 134.385734] RDX: 00007ffc2b39f7f0 RSI: 000000004020940d RDI: 0000000000000003 [ 134.386606] RBP: 0000000000000000 R08: 00111a82a4f015bb R09: 00007fa494221000 [ 134.387476] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 134.388342] R13: 0000000000f10000 R14: 0000558e844e2ac8 R15: 0000000000f10000 [ 134.389207] Fix it by only aborting transaction and journal in ocfs2_journal_dirty() now, and leave ocfs2_abort() later when detecting an aborted handle, e.g. start next transaction. Also log the handle details in this case. 
Link: https://lkml.kernel.org/r/20240530110630.3933832-1-joseph.qi@linux.alibaba.com Fixes: 8887b94d9322 ("ocfs2: stop using bdev->bd_super for journal error logging") Signed-off-by: Joseph Qi Reviewed-by: Heming Zhao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: [6.6+] Signed-off-by: Andrew Morton --- fs/ocfs2/journal.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 604fea3a26ff0..27c7683c7d3fa 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -778,13 +778,15 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) if (!is_handle_aborted(handle)) { journal_t *journal = handle->h_transaction->t_journal; - mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. " - "Aborting transaction and journal.\n"); + mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed: " + "handle type %u started at line %u, credits %u/%u " + "errcode %d. Aborting transaction and journal.\n", + handle->h_type, handle->h_line_no, + handle->h_requested_credits, + jbd2_handle_buffer_credits(handle), status); handle->h_err = status; jbd2_journal_abort_handle(handle); jbd2_journal_abort(journal, status); - ocfs2_abort(bh->b_assoc_map->host->i_sb, - "Journal already aborted.\n"); } } } From 685d03c3795378fca6a1b3d43581f7f1a3fc095f Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 30 May 2024 19:06:30 +0800 Subject: [PATCH 106/272] ocfs2: fix NULL pointer dereference in ocfs2_abort_trigger() bdev->bd_super has been removed and commit 8887b94d9322 changed the usage from bdev->bd_super to b_assoc_map->host->i_sb. Since ocfs2 hasn't set bh->b_assoc_map, it will trigger a NULL pointer dereference when calling into ocfs2_abort_trigger(). Actually this was pointed out before, see commit 74e364ad1b13. But I made a mistake when reviewing commit 8887b94d9322 and thereby re-introduced this regression. 
Since we cannot revive bdev in buffer head, so fix this issue by initializing all types of ocfs2 triggers when fill super, and then get the specific ocfs2 trigger from ocfs2_caching_info when access journal. [joseph.qi@linux.alibaba.com: v2] Link: https://lkml.kernel.org/r/20240602112045.1112708-1-joseph.qi@linux.alibaba.com Link: https://lkml.kernel.org/r/20240530110630.3933832-2-joseph.qi@linux.alibaba.com Fixes: 8887b94d9322 ("ocfs2: stop using bdev->bd_super for journal error logging") Signed-off-by: Joseph Qi Reviewed-by: Heming Zhao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: [6.6+] Signed-off-by: Andrew Morton --- fs/ocfs2/journal.c | 182 +++++++++++++++++++++++++-------------------- fs/ocfs2/ocfs2.h | 27 +++++++ fs/ocfs2/super.c | 4 +- 3 files changed, 131 insertions(+), 82 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 27c7683c7d3fa..86807086b2dfd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -479,12 +479,6 @@ int ocfs2_allocate_extend_trans(handle_t *handle, int thresh) return status; } - -struct ocfs2_triggers { - struct jbd2_buffer_trigger_type ot_triggers; - int ot_offset; -}; - static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers) { return container_of(triggers, struct ocfs2_triggers, ot_triggers); @@ -548,85 +542,76 @@ static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh) { + struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers); + mlog(ML_ERROR, "ocfs2_abort_trigger called by JBD2. 
bh = 0x%lx, " "bh->b_blocknr = %llu\n", (unsigned long)bh, (unsigned long long)bh->b_blocknr); - ocfs2_error(bh->b_assoc_map->host->i_sb, + ocfs2_error(ot->sb, "JBD2 has aborted our journal, ocfs2 cannot continue\n"); } -static struct ocfs2_triggers di_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dinode, i_check), -}; - -static struct ocfs2_triggers eb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_extent_block, h_check), -}; - -static struct ocfs2_triggers rb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), -}; - -static struct ocfs2_triggers gd_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), -}; - -static struct ocfs2_triggers db_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_db_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, -}; +static void ocfs2_setup_csum_triggers(struct super_block *sb, + enum ocfs2_journal_trigger_type type, + struct ocfs2_triggers *ot) +{ + BUG_ON(type >= OCFS2_JOURNAL_TRIGGER_COUNT); -static struct ocfs2_triggers xb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), -}; + switch (type) { + case OCFS2_JTR_DI: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dinode, i_check); + break; + case OCFS2_JTR_EB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_extent_block, h_check); + break; + case OCFS2_JTR_RB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct 
ocfs2_refcount_block, rf_check); + break; + case OCFS2_JTR_GD: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_group_desc, bg_check); + break; + case OCFS2_JTR_DB: + ot->ot_triggers.t_frozen = ocfs2_db_frozen_trigger; + break; + case OCFS2_JTR_XB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_xattr_block, xb_check); + break; + case OCFS2_JTR_DQ: + ot->ot_triggers.t_frozen = ocfs2_dq_frozen_trigger; + break; + case OCFS2_JTR_DR: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check); + break; + case OCFS2_JTR_DL: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check); + break; + case OCFS2_JTR_NONE: + /* To make compiler happy... */ + return; + } -static struct ocfs2_triggers dq_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_dq_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, -}; + ot->ot_triggers.t_abort = ocfs2_abort_trigger; + ot->sb = sb; +} -static struct ocfs2_triggers dr_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), -}; +void ocfs2_initialize_journal_triggers(struct super_block *sb, + struct ocfs2_triggers triggers[]) +{ + enum ocfs2_journal_trigger_type type; -static struct ocfs2_triggers dl_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), -}; + for (type = OCFS2_JTR_DI; type < OCFS2_JOURNAL_TRIGGER_COUNT; type++) + ocfs2_setup_csum_triggers(sb, type, &triggers[type]); +} static int __ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, @@ -708,56 +693,91 @@ static int __ocfs2_journal_access(handle_t *handle, int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, struct 
buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DI], + type); } int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_EB], + type); } int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &rb_triggers, + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_RB], type); } int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_GD], + type); } int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DB], + type); } int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, 
ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_XB], + type); } int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DQ], + type); } int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DR], + type); } int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DL], + type); } int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index a503c553bab21..8fe826143d7bf 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -284,6 +284,30 @@ enum ocfs2_mount_options #define OCFS2_OSB_ERROR_FS 0x0004 #define OCFS2_DEFAULT_ATIME_QUANTUM 60 +struct ocfs2_triggers { + struct jbd2_buffer_trigger_type ot_triggers; + int ot_offset; + struct super_block *sb; +}; + +enum ocfs2_journal_trigger_type { + OCFS2_JTR_DI, + OCFS2_JTR_EB, + OCFS2_JTR_RB, + OCFS2_JTR_GD, + OCFS2_JTR_DB, + OCFS2_JTR_XB, + OCFS2_JTR_DQ, + OCFS2_JTR_DR, + OCFS2_JTR_DL, + OCFS2_JTR_NONE /* This must be the last entry */ +}; + +#define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE + +void ocfs2_initialize_journal_triggers(struct super_block *sb, + struct ocfs2_triggers triggers[]); + struct 
ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; @@ -351,6 +375,9 @@ struct ocfs2_super struct ocfs2_journal *journal; unsigned long osb_commit_interval; + /* Journal triggers for checksum */ + struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT]; + struct delayed_work la_enable_wq; /* diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8aabaed2c1cb9..afee70125ae3b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1075,9 +1075,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, osb, &ocfs2_osb_debug_fops); - if (ocfs2_meta_ecc(osb)) + if (ocfs2_meta_ecc(osb)) { + ocfs2_initialize_journal_triggers(sb, osb->s_journal_triggers); ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, osb->osb_debug_root); + } status = ocfs2_mount_volume(sb); if (status < 0) From 8e5bd4eadd01ea0c47f3a1c798815849f813a700 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 22 May 2024 15:58:30 -0700 Subject: [PATCH 107/272] gcc: disable '-Warray-bounds' for gcc-9 '-Warray-bounds' is already disabled for gcc-10+. Now that we've merged bitmap_{read,write), I see the following error when building the kernel with gcc-9.4 (Ubuntu 20.04.4 LTS) for x86_64 allmodconfig: drivers/pinctrl/pinctrl-cy8c95x0.c: In function `cy8c95x0_read_regs_mask.isra.0': include/linux/bitmap.h:756:18: error: array subscript [1, 288230376151711744] is outside array bounds of `long unsigned int[1]' [-Werror=array-bounds] 756 | value_high = map[index + 1] & BITMAP_LAST_WORD_MASK(start + nbits); | ~~~^~~~~~~~~~~ The immediate reason is that the commit b44759705f7d ("bitmap: make bitmap_{get,set}_value8() use bitmap_{read,write}()") switched the bitmap_get_value8() to an alias of bitmap_read(); the same for 'set'. 
Now, the code that triggers -Warray-bounds calls the function like this: #define MAX_BANK 8 #define BANK_SZ 8 #define MAX_LINE (MAX_BANK * BANK_SZ) DECLARE_BITMAP(tval, MAX_LINE); // 64-bit map: unsigned long tval[1] read_val |= bitmap_get_value8(tval, i * BANK_SZ) & ~bits; bitmap_read() is implemented such that it may conditionally dereference a pointer beyond the boundary like this: unsigned long offset = start % BITS_PER_LONG; unsigned long space = BITS_PER_LONG - offset; if (space >= nbits) return (map[index] >> offset) & BITMAP_LAST_WORD_MASK(nbits); value_low = map[index] & BITMAP_FIRST_WORD_MASK(start); value_high = map[index + 1] & BITMAP_LAST_WORD_MASK(start + nbits); return (value_low >> offset) | (value_high << space); In case of bitmap_get_value8(), it's impossible to violate the boundary because 'space >= nbits' is always true for byte-aligned 8-bit access, so the early return is always taken. So, this is clearly a false-positive. The same type of false-positive breaks my allmodconfig build in many places. gcc-8 is clear, however. Link: https://lkml.kernel.org/r/20240522225830.1201778-1-yury.norov@gmail.com Fixes: b44759705f7d ("bitmap: make bitmap_{get,set}_value8() use bitmap_{read,write}()") Signed-off-by: Yury Norov Cc: Alexander Lobakin Cc: David S. Miller Cc: Gustavo A. R. Silva Cc: Masahiro Yamada Cc: Nhat Pham Cc: Petr Mladek Cc: Randy Dunlap Cc: Vincent Guittot Cc: Yoann Congal Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 72404c1f21577..febdea2afc3be 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -883,7 +883,7 @@ config GCC10_NO_ARRAY_BOUNDS config CC_NO_ARRAY_BOUNDS bool - default y if CC_IS_GCC && GCC_VERSION >= 100000 && GCC10_NO_ARRAY_BOUNDS + default y if CC_IS_GCC && GCC_VERSION >= 90000 && GCC10_NO_ARRAY_BOUNDS # Currently, disable -Wstringop-overflow for GCC globally. 
config GCC_NO_STRINGOP_OVERFLOW From 8bb592c2eca8fd2bc06db7d80b38da18da4a2f43 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 5 Jun 2024 17:21:46 -0400 Subject: [PATCH 108/272] mm/page_table_check: fix crash on ZONE_DEVICE Not all pages may apply to pgtable check. One example is ZONE_DEVICE pages: they map PFNs directly, and they don't allocate page_ext at all even if there's struct page around. One may reference devm_memremap_pages(). When both ZONE_DEVICE and page-table-check enabled, then try to map some dax memories, one can trigger kernel bug constantly now when the kernel was trying to inject some pfn maps on the dax device: kernel BUG at mm/page_table_check.c:55! While it's pretty legal to use set_pxx_at() for ZONE_DEVICE pages for page fault resolutions, skip all the checks if page_ext doesn't even exist in pgtable checker, which applies to ZONE_DEVICE but maybe more. Link: https://lkml.kernel.org/r/20240605212146.994486-1-peterx@redhat.com Fixes: df4e817b7108 ("mm: page table check") Signed-off-by: Peter Xu Reviewed-by: Pasha Tatashin Reviewed-by: Dan Williams Reviewed-by: Alistair Popple Cc: Signed-off-by: Andrew Morton --- mm/page_table_check.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 4169576bed729..509c6ef8de400 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -73,6 +73,9 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) page = pfn_to_page(pfn); page_ext = page_ext_get(page); + if (!page_ext) + return; + BUG_ON(PageSlab(page)); anon = PageAnon(page); @@ -110,6 +113,9 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt, page = pfn_to_page(pfn); page_ext = page_ext_get(page); + if (!page_ext) + return; + BUG_ON(PageSlab(page)); anon = PageAnon(page); @@ -140,7 +146,10 @@ void __page_table_check_zero(struct page *page, unsigned int order) BUG_ON(PageSlab(page)); page_ext = page_ext_get(page); - 
BUG_ON(!page_ext); + + if (!page_ext) + return; + for (i = 0; i < (1ul << order); i++) { struct page_table_check *ptc = get_page_table_check(page_ext); From 384a746bb55960aa5ffb3a67de08f11fc2f51042 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 5 Jun 2024 11:17:10 +0200 Subject: [PATCH 109/272] Revert "mm: init_mlocked_on_free_v3" There was insufficient review and no agreement that this is the right approach. There are serious flaws with the implementation that make processes using mlock() not even work with simple fork() [1] and we get reliable crashes when rebooting. Further, simply because we might be unmapping a single PTE of a large mlocked folio, we shouldn't zero out the whole folio. ... especially because the code can also *corrupt* unrelated memory because kernel_init_pages(page, folio_nr_pages(folio)); could end up writing outside of the actual folio if we work with a tail page. Let's revert it. Once there is agreement that this is the right approach, the issues were fixed and there was reasonable review and proper testing, we can consider it again. 
[1] https://lkml.kernel.org/r/4da9da2f-73e4-45fd-b62f-a8a513314057@redhat.com Link: https://lkml.kernel.org/r/20240605091710.38961-1-david@redhat.com Fixes: ba42b524a040 ("mm: init_mlocked_on_free_v3") Signed-off-by: David Hildenbrand Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20240528151340.4282-1-00107082@163.com/ Reported-by: Lance Yang Closes: https://lkml.kernel.org/r/20240601140917.43562-1-ioworker0@gmail.com Acked-by: Lance Yang Cc: York Jasper Niebuhr Cc: Matthew Wilcox (Oracle) Cc: Kees Cook Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 6 --- include/linux/mm.h | 9 +--- mm/internal.h | 1 - mm/memory.c | 6 --- mm/mm_init.c | 43 +++---------------- mm/page_alloc.c | 2 +- security/Kconfig.hardening | 15 ------- 7 files changed, 9 insertions(+), 73 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b600df82669db..11e57ba2985cc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2192,12 +2192,6 @@ Format: 0 | 1 Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON. - init_mlocked_on_free= [MM] Fill freed userspace memory with zeroes if - it was mlock'ed and not explicitly munlock'ed - afterwards. - Format: 0 | 1 - Default set by CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON - init_pkru= [X86] Specify the default memory protection keys rights register contents for all processes. 0x55555554 by default (disallow access to all but pkey 0). 
Can diff --git a/include/linux/mm.h b/include/linux/mm.h index 9849dfda44d43..9a5652c5fadd5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3776,14 +3776,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, - &init_on_free); -} - -DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); -static inline bool want_init_mlocked_on_free(void) -{ - return static_branch_maybe(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, - &init_mlocked_on_free); + &init_on_free); } extern bool _debug_pagealloc_enabled_early; diff --git a/mm/internal.h b/mm/internal.h index b2c75b12014e7..c72c306761a48 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -588,7 +588,6 @@ extern void __putback_isolated_page(struct page *page, unsigned int order, extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); -extern void kernel_init_pages(struct page *page, int numpages); /* * This will have no effect, other than possibly generating a warning, if the diff --git a/mm/memory.c b/mm/memory.c index 0f47a533014e4..2bc8032a30a2f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1507,12 +1507,6 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, if (unlikely(folio_mapcount(folio) < 0)) print_bad_pte(vma, addr, ptent, page); } - - if (want_init_mlocked_on_free() && folio_test_mlocked(folio) && - !delay_rmap && folio_test_anon(folio)) { - kernel_init_pages(page, folio_nr_pages(folio)); - } - if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; diff --git a/mm/mm_init.c b/mm/mm_init.c index f72b852bd5b8e..3ec04933f7fd8 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2523,9 +2523,6 @@ EXPORT_SYMBOL(init_on_alloc); 
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); EXPORT_SYMBOL(init_on_free); -DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); -EXPORT_SYMBOL(init_mlocked_on_free); - static bool _init_on_alloc_enabled_early __read_mostly = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) @@ -2543,14 +2540,6 @@ static int __init early_init_on_free(char *buf) } early_param("init_on_free", early_init_on_free); -static bool _init_mlocked_on_free_enabled_early __read_mostly - = IS_ENABLED(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON); -static int __init early_init_mlocked_on_free(char *buf) -{ - return kstrtobool(buf, &_init_mlocked_on_free_enabled_early); -} -early_param("init_mlocked_on_free", early_init_mlocked_on_free); - DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); /* @@ -2578,21 +2567,12 @@ static void __init mem_debugging_and_hardening_init(void) } #endif - if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early || - _init_mlocked_on_free_enabled_early) && + if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) && page_poisoning_requested) { pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " - "will take precedence over init_on_alloc, init_on_free " - "and init_mlocked_on_free\n"); + "will take precedence over init_on_alloc and init_on_free\n"); _init_on_alloc_enabled_early = false; _init_on_free_enabled_early = false; - _init_mlocked_on_free_enabled_early = false; - } - - if (_init_mlocked_on_free_enabled_early && _init_on_free_enabled_early) { - pr_info("mem auto-init: init_on_free is on, " - "will take precedence over init_mlocked_on_free\n"); - _init_mlocked_on_free_enabled_early = false; } if (_init_on_alloc_enabled_early) { @@ -2609,17 +2589,9 @@ static void __init mem_debugging_and_hardening_init(void) static_branch_disable(&init_on_free); } - if (_init_mlocked_on_free_enabled_early) { - want_check_pages = true; - 
static_branch_enable(&init_mlocked_on_free); - } else { - static_branch_disable(&init_mlocked_on_free); - } - - if (IS_ENABLED(CONFIG_KMSAN) && (_init_on_alloc_enabled_early || - _init_on_free_enabled_early || _init_mlocked_on_free_enabled_early)) - pr_info("mem auto-init: please make sure init_on_alloc, init_on_free and " - "init_mlocked_on_free are disabled when running KMSAN\n"); + if (IS_ENABLED(CONFIG_KMSAN) && + (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) + pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); #ifdef CONFIG_DEBUG_PAGEALLOC if (debug_pagealloc_enabled()) { @@ -2658,10 +2630,9 @@ static void __init report_meminit(void) else stack = "off"; - pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s, mlocked free:%s\n", + pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n", stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off", - want_init_on_free() ? "on" : "off", - want_init_mlocked_on_free() ? "on" : "off"); + want_init_on_free() ? "on" : "off"); if (want_init_on_free()) pr_info("mem auto-init: clearing system memory may take some time...\n"); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 222299b5c0e6a..7300aa9f14b0b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1016,7 +1016,7 @@ static inline bool should_skip_kasan_poison(struct page *page) return page_kasan_tag(page) == KASAN_TAG_KERNEL; } -void kernel_init_pages(struct page *page, int numpages) +static void kernel_init_pages(struct page *page, int numpages) { int i; diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index effbf5982be10..2cff851ebfd7e 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -255,21 +255,6 @@ config INIT_ON_FREE_DEFAULT_ON touching "cold" memory areas. Most cases see 3-5% impact. Some synthetic workloads have measured as high as 8%. 
-config INIT_MLOCKED_ON_FREE_DEFAULT_ON - bool "Enable mlocked memory zeroing on free" - depends on !KMSAN - help - This config has the effect of setting "init_mlocked_on_free=1" - on the kernel command line. If it is enabled, all mlocked process - memory is zeroed when freed. This restriction to mlocked memory - improves performance over "init_on_free" but can still be used to - protect confidential data like key material from content exposures - to other processes, as well as live forensics and cold boot attacks. - Any non-mlocked memory is not cleared before it is reassigned. This - configuration can be overwritten by setting "init_mlocked_on_free=0" - on the command line. The "init_on_free" boot option takes - precedence over "init_mlocked_on_free". - config CC_HAS_ZERO_CALL_USED_REGS def_bool $(cc-option,-fzero-call-used-regs=used-gpr) # https://github.com/ClangBuiltLinux/linux/issues/1766 From 3ab85f4046c12fb773084c2974cd7bbe8a3e2e68 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 2 Jun 2024 21:55:10 +0100 Subject: [PATCH 110/272] MAINTAINERS: remove Lorenzo as vmalloc reviewer I haven't had the bandwidth to review vmalloc patches recently and I suspect I won't be able to do so consistently moving forwards, so I think it's best if I remove myself as reviewer for the time being. 
Link: https://lkml.kernel.org/r/20240602205510.108807-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index aacccb376c28a..3620c4e6b4690 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23974,7 +23974,6 @@ VMALLOC M: Andrew Morton R: Uladzislau Rezki R: Christoph Hellwig -R: Lorenzo Stoakes L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org From c944bf60c16a65ae812a59fd1b66f6c9e18c91c9 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Jun 2024 16:38:31 -0700 Subject: [PATCH 111/272] lib/alloc_tag: do not register sysctl interface when CONFIG_SYSCTL=n Memory allocation profiling is trying to register sysctl interface even when CONFIG_SYSCTL=n, resulting in proc_do_static_key() being undefined. Prevent that by skipping sysctl registration for such configurations. Link: https://lkml.kernel.org/r/20240601233831.617124-1-surenb@google.com Fixes: 22d407b164ff ("lib: add allocation tagging support for memory allocation profiling") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405280616.wcOGWJEj-lkp@intel.com/ Acked-by: Vlastimil Babka Cc: Kent Overstreet Cc: Kees Cook Cc: Pasha Tatashin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 11ed973ac359d..c347b8b72d78d 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -227,6 +227,7 @@ struct page_ext_operations page_alloc_tagging_ops = { }; EXPORT_SYMBOL(page_alloc_tagging_ops); +#ifdef CONFIG_SYSCTL static struct ctl_table memory_allocation_profiling_sysctls[] = { { .procname = "mem_profiling", @@ -241,6 +242,17 @@ static struct ctl_table memory_allocation_profiling_sysctls[] = { { } }; +static void 
__init sysctl_init(void) +{ + if (!mem_profiling_support) + memory_allocation_profiling_sysctls[0].mode = 0444; + + register_sysctl_init("vm", memory_allocation_profiling_sysctls); +} +#else /* CONFIG_SYSCTL */ +static inline void sysctl_init(void) {} +#endif /* CONFIG_SYSCTL */ + static int __init alloc_tag_init(void) { const struct codetag_type_desc desc = { @@ -253,9 +265,7 @@ static int __init alloc_tag_init(void) if (IS_ERR(alloc_tag_cttype)) return PTR_ERR(alloc_tag_cttype); - if (!mem_profiling_support) - memory_allocation_profiling_sysctls[0].mode = 0444; - register_sysctl_init("vm", memory_allocation_profiling_sysctls); + sysctl_init(); procfs_init(); return 0; From a273559e9eb68cb58c57803d76a1622b8324a878 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Jun 2024 16:38:40 -0700 Subject: [PATCH 112/272] lib/alloc_tag: fix RCU imbalance in pgalloc_tag_get() put_page_tag_ref() should be called only when get_page_tag_ref() returns a valid reference because only in that case get_page_tag_ref() enters RCU read section while put_page_tag_ref() will call rcu_read_unlock() even if the provided reference is NULL. Fix pgalloc_tag_get() which does not follow this rule causing RCU imbalance. Add a warning in put_page_tag_ref() to catch any future mistakes. 
Link: https://lkml.kernel.org/r/20240601233840.617458-1-surenb@google.com Fixes: cc92eba1c88b ("mm: fix non-compound multi-order memory accounting in __free_pages") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202405271029.6d2f9c4c-lkp@intel.com Acked-by: Vlastimil Babka Cc: Kent Overstreet Cc: Kees Cook Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 86ba5d33e43bd..9cacadbd61f8c 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -37,6 +37,9 @@ static inline union codetag_ref *get_page_tag_ref(struct page *page) static inline void put_page_tag_ref(union codetag_ref *ref) { + if (WARN_ON(!ref)) + return; + page_ext_put(page_ext_from_codetag_ref(ref)); } @@ -102,9 +105,11 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page) union codetag_ref *ref = get_page_tag_ref(page); alloc_tag_sub_check(ref); - if (ref && ref->ct) - tag = ct_to_alloc_tag(ref->ct); - put_page_tag_ref(ref); + if (ref) { + if (ref->ct) + tag = ct_to_alloc_tag(ref->ct); + put_page_tag_ref(ref); + } } return tag; From 6a50c9b512f7734bc356f4bd47885a6f7c98491a Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Fri, 7 Jun 2024 17:40:48 +0800 Subject: [PATCH 113/272] mm: huge_memory: fix misused mapping_large_folio_support() for anon folios When I did a large folios split test, a WARNING "[ 5059.122759][ T166] Cannot split file folio to non-0 order" was triggered. But the test cases are only for anonymous folios, while mapping_large_folio_support() is only reasonable for page cache folios. In split_huge_page_to_list_to_order(), the folio passed to mapping_large_folio_support() may be an anonymous folio. The folio_test_anon() check is missing, so the split of the anonymous THP fails. This is also the same for shmem_mapping(). 
We'd better add a check for both. But the shmem_mapping() in __split_huge_page() is not involved, because for anonymous folios the end parameter is set to -1, so (head[i].index >= end) is always false and shmem_mapping() is not called. Also add a VM_WARN_ONCE() in mapping_large_folio_support() for anon mappings, so we can detect the wrong use more easily. THP folios may exist in the pagecache even if the file system doesn't support large folios; this is because when CONFIG_TRANSPARENT_HUGEPAGE is enabled, khugepaged will try to collapse read-only file-backed pages to THP. But the mapping does not actually support multi-order large folios properly. Using /sys/kernel/debug/split_huge_pages to verify this: with this patch, large anon THP is successfully split and the warning no longer appears. Link: https://lkml.kernel.org/r/202406071740485174hcFl7jRxncsHDtI-Pz-o@zte.com.cn Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Reviewed-by: Barry Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand Signed-off-by: Ran Xiaokai Cc: Michal Hocko Cc: xu xin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 4 ++++ mm/huge_memory.c | 28 +++++++++++++++++----------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ee633712bba0b..59f1df0cde5a0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -381,6 +381,10 @@ static inline void mapping_set_large_folios(struct address_space *mapping) */ static inline bool mapping_large_folio_support(struct address_space *mapping) { + /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 89932fd0f62e8..db7946a0a28c4 100644 --- a/mm/huge_memory.c 
+++ b/mm/huge_memory.c @@ -3009,30 +3009,36 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (new_order >= folio_order(folio)) return -EINVAL; - /* Cannot split anonymous THP to order-1 */ - if (new_order == 1 && folio_test_anon(folio)) { - VM_WARN_ONCE(1, "Cannot split to order-1 folio"); - return -EINVAL; - } - - if (new_order) { - /* Only swapping a whole PMD-mapped folio is supported */ - if (folio_test_swapcache(folio)) + if (folio_test_anon(folio)) { + /* order-1 is not supported for anonymous THP. */ + if (new_order == 1) { + VM_WARN_ONCE(1, "Cannot split to order-1 folio"); return -EINVAL; + } + } else if (new_order) { /* Split shmem folio to non-zero order not supported */ if (shmem_mapping(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split shmem folio to non-0 order"); return -EINVAL; } - /* No split if the file system does not support large folio */ - if (!mapping_large_folio_support(folio->mapping)) { + /* + * No split if the file system does not support large folio. + * Note that we might still have THPs in such mappings due to + * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping + * does not actually support large folios properly. + */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + !mapping_large_folio_support(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split file folio to non-0 order"); return -EINVAL; } } + /* Only swapping a whole PMD-mapped folio is supported */ + if (folio_test_swapcache(folio) && new_order) + return -EINVAL; is_hzp = is_huge_zero_folio(folio); if (is_hzp) { From 7fea700e04bd3f424c2d836e98425782f97b494e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 8 Jun 2024 14:06:16 +0200 Subject: [PATCH 114/272] zap_pid_ns_processes: clear TIF_NOTIFY_SIGNAL along with TIF_SIGPENDING kernel_wait4() doesn't sleep and returns -EINTR if there is no eligible child and signal_pending() is true. 
That is why zap_pid_ns_processes() clears TIF_SIGPENDING but this is not enough, it should also clear TIF_NOTIFY_SIGNAL to make signal_pending() return false and avoid a busy-wait loop. Link: https://lkml.kernel.org/r/20240608120616.GB7947@redhat.com Fixes: 12db8b690010 ("entry: Add support for TIF_NOTIFY_SIGNAL") Signed-off-by: Oleg Nesterov Reported-by: Rachel Menge Closes: https://lore.kernel.org/all/1386cd49-36d0-4a5c-85e9-bc42056a5a38@linux.microsoft.com/ Reviewed-by: Boqun Feng Tested-by: Wei Fu Reviewed-by: Jens Axboe Cc: Allen Pais Cc: Christian Brauner Cc: Frederic Weisbecker Cc: Joel Fernandes (Google) Cc: Joel Granados Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Mike Christie Cc: Neeraj Upadhyay Cc: Paul E. McKenney Cc: Steven Rostedt (Google) Cc: Zqiang Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/pid_namespace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index dc48fecfa1dce..25f3cf679b358 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -218,6 +218,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ do { clear_thread_flag(TIF_SIGPENDING); + clear_thread_flag(TIF_NOTIFY_SIGNAL); rc = kernel_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); From c1558bc57b8e5b4da5d821537cd30e2e660861d8 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Mon, 10 Jun 2024 11:27:43 +0200 Subject: [PATCH 115/272] gcov: add support for GCC 14 Using gcov on kernels compiled with GCC 14 results in truncated 16-byte long .gcda files with no usable data. To fix this, update GCOV_COUNTERS to match the value defined by GCC 14. Tested with GCC versions 14.1.0 and 13.2.0. 
Link: https://lkml.kernel.org/r/20240610092743.1609845-1-oberpar@linux.ibm.com Signed-off-by: Peter Oberparleiter Reported-by: Allison Henderson Reported-by: Chuck Lever III Tested-by: Chuck Lever Cc: Signed-off-by: Andrew Morton --- kernel/gcov/gcc_4_7.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 74a4ef1da9ad7..fd75b4a484d76 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include #include "gcov.h" -#if (__GNUC__ >= 10) +#if (__GNUC__ >= 14) +#define GCOV_COUNTERS 9 +#elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 #elif (__GNUC__ >= 7) #define GCOV_COUNTERS 9 From 3afb76a66b5559a7b595155803ce23801558a7a9 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Thu, 6 Jun 2024 14:06:22 -0400 Subject: [PATCH 116/272] mm: mmap: allow for the maximum number of bits for randomizing mmap_base by default An ASLR regression was noticed [1] and tracked down to file-mapped areas being backed by THP in recent kernels. The 21-bit alignment constraint for such mappings reduces the entropy for randomizing the placement of 64-bit library mappings and breaks ASLR completely for 32-bit libraries. The reported issue is easily addressed by increasing vm.mmap_rnd_bits and vm.mmap_rnd_compat_bits. This patch just provides a simple way to set ARCH_MMAP_RND_BITS and ARCH_MMAP_RND_COMPAT_BITS to their maximum values allowed by the architecture at build time. [1] https://zolutal.github.io/aslrnt/ [akpm@linux-foundation.org: default to `y' if 32-bit, per Rafael] Link: https://lkml.kernel.org/r/20240606180622.102099-1-aquini@redhat.com Fixes: 1854bc6e2420 ("mm/readahead: Align file mappings for non-DAX") Signed-off-by: Rafael Aquini Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Mike Rapoport (IBM) Cc: Paul E. 
McKenney Cc: Petr Mladek Cc: Samuel Holland Cc: Signed-off-by: Andrew Morton --- arch/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 975dd22a2dbd2..3e2a63772b3db 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1046,10 +1046,21 @@ config ARCH_MMAP_RND_BITS_MAX config ARCH_MMAP_RND_BITS_DEFAULT int +config FORCE_MAX_MMAP_RND_BITS + bool "Force maximum number of bits to use for ASLR of mmap base address" + default y if !64BIT + help + ARCH_MMAP_RND_BITS and ARCH_MMAP_RND_COMPAT_BITS represent the number + of bits to use for ASLR and if no custom value is assigned (EXPERT) + then the architecture's lower bound (minimum) value is assumed. + This toggle changes that default assumption to assume the arch upper + bound (maximum) value instead. + config ARCH_MMAP_RND_BITS int "Number of bits to use for ASLR of mmap base address" if EXPERT range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT + default ARCH_MMAP_RND_BITS_MAX if FORCE_MAX_MMAP_RND_BITS default ARCH_MMAP_RND_BITS_MIN depends on HAVE_ARCH_MMAP_RND_BITS help @@ -1084,6 +1095,7 @@ config ARCH_MMAP_RND_COMPAT_BITS int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT + default ARCH_MMAP_RND_COMPAT_BITS_MAX if FORCE_MAX_MMAP_RND_BITS default ARCH_MMAP_RND_COMPAT_BITS_MIN depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS help From 653c5c75115c1e23b8393c1cb1ad2d6f6712742f Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Fri, 7 Jun 2024 20:35:41 +0000 Subject: [PATCH 117/272] mm/memfd: add documentation for MFD_NOEXEC_SEAL MFD_EXEC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When MFD_NOEXEC_SEAL was introduced, there was one big mistake: it didn't have proper documentation. 
This led to a lot of confusion, especially about whether or not memfd created with the MFD_NOEXEC_SEAL flag is sealable. Before MFD_NOEXEC_SEAL, memfd had to explicitly set MFD_ALLOW_SEALING to be sealable, so it's a fair question. As one might have noticed, unlike other flags in memfd_create, MFD_NOEXEC_SEAL is actually a combination of multiple flags. The idea is to make it easier to use memfd in the most common way, which is NOEXEC + F_SEAL_EXEC + MFD_ALLOW_SEALING. This works with sysctl vm.memfd_noexec to help existing applications move to a more secure way of using memfd. Proposals have been made to make MFD_NOEXEC_SEAL non-sealable, unless MFD_ALLOW_SEALING is set, to be consistent with other flags [1]. Those are based on the viewpoint that each flag is an atomic unit, which is a reasonable assumption. However, MFD_NOEXEC_SEAL was designed with the intent of promoting the most secure method of using memfd, hence the combination of multiple functionalities into one bit. Furthermore, MFD_NOEXEC_SEAL was added more than a year ago, and multiple applications and distributions have backported and utilized it. Altering ABI now presents a degree of risk and may lead to disruption. MFD_NOEXEC_SEAL is a new flag, and applications must change their code to use it. There is no backward compatibility problem. When sysctl vm.memfd_noexec == 1 or 2, applications that don't set MFD_NOEXEC_SEAL or MFD_EXEC will get MFD_NOEXEC_SEAL memfd. An old application might break; that is by design, and on such a system vm.memfd_noexec = 0 shall be used. So there is also no backward compatibility problem. I propose to include this documentation patch to assist in clarifying the semantics of MFD_NOEXEC_SEAL, thereby preventing any potential future confusion. Finally, I would like to express my gratitude to David Rheinsberg and Barnabás Pőcze for initiating the discussion on the topic of sealability. 
[1] https://lore.kernel.org/lkml/20230714114753.170814-1-david@readahead.eu/ [jeffxu@chromium.org: updates per Randy] Link: https://lkml.kernel.org/r/20240611034903.3456796-2-jeffxu@chromium.org [jeffxu@chromium.org: v3] Link: https://lkml.kernel.org/r/20240611231409.3899809-2-jeffxu@chromium.org Link: https://lkml.kernel.org/r/20240607203543.2151433-2-jeffxu@google.com Signed-off-by: Jeff Xu Reviewed-by: Randy Dunlap Cc: Aleksa Sarai Cc: Barnabás Pőcze Cc: Daniel Verkamp Cc: David Rheinsberg Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jorge Lucangeli Obes Cc: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/userspace-api/index.rst | 1 + Documentation/userspace-api/mfd_noexec.rst | 86 ++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 Documentation/userspace-api/mfd_noexec.rst diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index 5926115ec0ed8..8a251d71fa6e1 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -32,6 +32,7 @@ Security-related interfaces seccomp_filter landlock lsm + mfd_noexec spec_ctrl tee diff --git a/Documentation/userspace-api/mfd_noexec.rst b/Documentation/userspace-api/mfd_noexec.rst new file mode 100644 index 0000000000000..7afcc480e38f0 --- /dev/null +++ b/Documentation/userspace-api/mfd_noexec.rst @@ -0,0 +1,86 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +Introduction of non-executable mfd +================================== +:Author: + Daniel Verkamp + Jeff Xu + +:Contributor: + Aleksa Sarai + +Since Linux introduced the memfd feature, memfds have always had their +execute bit set, and the memfd_create() syscall doesn't allow setting +it differently. 
+ +However, in a secure-by-default system, such as ChromeOS, (where all +executables should come from the rootfs, which is protected by verified +boot), this executable nature of memfd opens a door for NoExec bypass +and enables “confused deputy attack”. E.g, in VRP bug [1]: cros_vm +process created a memfd to share the content with an external process, +however the memfd is overwritten and used for executing arbitrary code +and root escalation. [2] lists more VRP of this kind. + +On the other hand, executable memfd has its legit use: runc uses memfd’s +seal and executable feature to copy the contents of the binary then +execute them. For such a system, we need a solution to differentiate runc's +use of executable memfds and an attacker's [3]. + +To address those above: + - Let memfd_create() set X bit at creation time. + - Let memfd be sealed for modifying X bit when NX is set. + - Add a new pid namespace sysctl: vm.memfd_noexec to help applications in + migrating and enforcing non-executable MFD. + +User API +======== +``int memfd_create(const char *name, unsigned int flags)`` + +``MFD_NOEXEC_SEAL`` + When MFD_NOEXEC_SEAL bit is set in the ``flags``, memfd is created + with NX. F_SEAL_EXEC is set and the memfd can't be modified to + add X later. MFD_ALLOW_SEALING is also implied. + This is the most common case for the application to use memfd. + +``MFD_EXEC`` + When MFD_EXEC bit is set in the ``flags``, memfd is created with X. + +Note: + ``MFD_NOEXEC_SEAL`` implies ``MFD_ALLOW_SEALING``. In case that + an app doesn't want sealing, it can add F_SEAL_SEAL after creation. + + +Sysctl: +======== +``pid namespaced sysctl vm.memfd_noexec`` + +The new pid namespaced sysctl vm.memfd_noexec has 3 values: + + - 0: MEMFD_NOEXEC_SCOPE_EXEC + memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like + MFD_EXEC was set. + + - 1: MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL + memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like + MFD_NOEXEC_SEAL was set. 
+ + - 2: MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED + memfd_create() without MFD_NOEXEC_SEAL will be rejected. + +The sysctl allows finer control of memfd_create for old software that +doesn't set the executable bit; for example, a container with +vm.memfd_noexec=1 means the old software will create non-executable memfd +by default while new software can create executable memfd by setting +MFD_EXEC. + +The value of vm.memfd_noexec is passed to child namespace at creation +time. In addition, the setting is hierarchical, i.e. during memfd_create, +we will search from current ns to root ns and use the most restrictive +setting. + +[1] https://crbug.com/1305267 + +[2] https://bugs.chromium.org/p/chromium/issues/list?q=type%3Dbug-security%20memfd%20escalation&can=1 + +[3] https://lwn.net/Articles/781013/ From e7d2a28bd0b27e43bff3f516ee0607d776b019f4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 5 Jun 2024 23:36:12 +0100 Subject: [PATCH 118/272] selftests: mm: make map_fixed_noreplace test names stable KTAP parsers interpret the output of ksft_test_result_*() as being the name of the test. The map_fixed_noreplace test uses a dynamically allocated base address for the mmap()s that it tests and currently includes this in the test names that it logs so the test names that are logged are not stable between runs. It also uses multiples of PAGE_SIZE which mean that runs for kernels with different PAGE_SIZE configurations can't be directly compared. Both these factors cause issues for CI systems when interpreting and displaying results. Fix this by replacing the current test names with fixed strings describing the intent of the mappings that are logged, the existing messages with the actual addresses and sizes are retained as diagnostic prints to aid in debugging. 
Link: https://lkml.kernel.org/r/20240605-kselftest-mm-fixed-noreplace-v1-1-a235db8b9be9@kernel.org Fixes: 4838cf70e539 ("selftests/mm: map_fixed_noreplace: conform test to TAP format output") Signed-off-by: Mark Brown Reviewed-by: Ryan Roberts Cc: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/map_fixed_noreplace.c | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c index b74813fdc9514..d53de2486080e 100644 --- a/tools/testing/selftests/mm/map_fixed_noreplace.c +++ b/tools/testing/selftests/mm/map_fixed_noreplace.c @@ -67,7 +67,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error: munmap failed!?\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 5*PAGE_SIZE at base\n"); addr = base_addr + page_size; size = 3 * page_size; @@ -76,7 +77,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error: first mmap() failed unexpectedly\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 3*PAGE_SIZE at base+PAGE_SIZE\n"); /* * Exact same mapping again: @@ -93,7 +95,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:1: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 5*PAGE_SIZE at base\n"); /* * Second mapping contained within first: @@ -111,7 +114,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:2: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 
0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 2*PAGE_SIZE at base+PAGE_SIZE\n"); /* * Overlap end of existing mapping: @@ -128,7 +132,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:3: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 2*PAGE_SIZE at base+(3*PAGE_SIZE)\n"); /* * Overlap start of existing mapping: @@ -145,7 +150,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:4: mmap() succeeded when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() 2*PAGE_SIZE bytes at base\n"); /* * Adjacent to start of existing mapping: @@ -162,7 +168,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:5: mmap() failed when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() PAGE_SIZE at base\n"); /* * Adjacent to end of existing mapping: @@ -179,7 +186,8 @@ int main(void) dump_maps(); ksft_exit_fail_msg("Error:6: mmap() failed when it shouldn't have\n"); } - ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + ksft_test_result_pass("mmap() PAGE_SIZE at base+(4*PAGE_SIZE)\n"); addr = base_addr; size = 5 * page_size; From 8e279f970b5cb0628f856b6735e2e47b4da9f76e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jun 2024 22:06:20 -0700 Subject: [PATCH 119/272] mm/migrate: fix 
kernel BUG at mm/compaction.c:2761! I hit the VM_BUG_ON(!list_empty(&cc->migratepages)) in compact_zone(); and if DEBUG_VM were off, then pages would be lost on a local list. Our convention is that if migrate_pages() reports complete success (0), then the migratepages list will be empty; but if it reports an error or some pages remaining, then its caller must putback_movable_pages(). There's a new case in which migrate_pages() has been reporting complete success, but returning with pages left on the migratepages list: when migrate_pages_batch() successfully split a folio on the deferred list, but then the "Failure isn't counted" call does not dispose of them all. Since that block is expecting the large folio to have been counted as 1 failure already, and since the return code is later adjusted to success whenever the returned list is found empty, the simple way to fix this safely is to count splitting the deferred folio as "a failure". Link: https://lkml.kernel.org/r/46c948b4-4dd8-6e03-4c7b-ce4e81cfa536@google.com Fixes: 7262f208ca68 ("mm/migrate: split source folio if it is on deferred split list") Signed-off-by: Hugh Dickins Cc: Baolin Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index dd04f578c19c3..2cc5a68f68435 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1654,7 +1654,12 @@ static int migrate_pages_batch(struct list_head *from, /* * The rare folio on the deferred split list should - * be split now. It should not count as a failure. + * be split now. It should not count as a failure: + * but increment nr_failed because, without doing so, + * migrate_pages() may report success with (split but + * unmigrated) pages still on its fromlist; whereas it + * always reports success when its fromlist is empty. + * * Only check it without removing it from the list. 
* Since the folio can be on deferred_split_scan() * local list and removing it can cause the local list @@ -1669,6 +1674,7 @@ static int migrate_pages_batch(struct list_head *from, if (nr_pages > 2 && !list_empty(&folio->_deferred_list)) { if (try_split_folio(folio, split_folios) == 0) { + nr_failed++; stats->nr_thp_split += is_thp; stats->nr_split++; continue; From cfdd12b48202398a879e8bc4e7fa023f4d473f62 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 12 Jun 2024 20:28:22 +0800 Subject: [PATCH 120/272] mm: fix possible OOB in numa_rebuild_large_mapping() The large folio is mapped with folio size(not greater PMD_SIZE) aligned virtual address during the pagefault, ie, 'addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE)' in do_anonymous_page(). But after the mremap(), the virtual address only requires PAGE_SIZE alignment. Also pte is moved to new in move_page_tables(), then traversal of the new pte in the numa_rebuild_large_mapping() could hit the following issue, Unable to handle kernel paging request at virtual address 00000a80c021a788 Mem abort info: ESR = 0x0000000096000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault Data abort info: ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=00002040341a6000 [00000a80c021a788] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 0000000096000004 [#1] SMP ... 
CPU: 76 PID: 15187 Comm: git Kdump: loaded Tainted: G W 6.10.0-rc2+ #209 Hardware name: Huawei TaiShan 2280 V2/BC82AMDD, BIOS 1.79 08/21/2021 pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : numa_rebuild_large_mapping+0x338/0x638 lr : numa_rebuild_large_mapping+0x320/0x638 sp : ffff8000b41c3b00 x29: ffff8000b41c3b30 x28: ffff8000812a0000 x27: 00000000000a8000 x26: 00000000000000a8 x25: 0010000000000001 x24: ffff20401c7170f0 x23: 0000ffff33a1e000 x22: 0000ffff33a76000 x21: ffff20400869eca0 x20: 0000ffff33976000 x19: 00000000000000a8 x18: ffffffffffffffff x17: 0000000000000000 x16: 0000000000000020 x15: ffff8000b41c36a8 x14: 0000000000000000 x13: 205d373831353154 x12: 5b5d333331363732 x11: 000000000011ff78 x10: 000000000011ff10 x9 : ffff800080273f30 x8 : 000000320400869e x7 : c0000000ffffd87f x6 : 00000000001e6ba8 x5 : ffff206f3fb5af88 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000000000000000 x1 : fffffdffc0000000 x0 : 00000a80c021a780 Call trace: numa_rebuild_large_mapping+0x338/0x638 do_numa_page+0x3e4/0x4e0 handle_pte_fault+0x1bc/0x238 __handle_mm_fault+0x20c/0x400 handle_mm_fault+0xa8/0x288 do_page_fault+0x124/0x498 do_translation_fault+0x54/0x80 do_mem_abort+0x4c/0xa8 el0_da+0x40/0x110 el0t_64_sync_handler+0xe4/0x158 el0t_64_sync+0x188/0x190 Fix it by making the start and end not only within the vma range, but also within the page table range. 
Link: https://lkml.kernel.org/r/20240612122822.4033433-1-wangkefeng.wang@huawei.com Fixes: d2136d749d76 ("mm: support multi-size THP numa balancing") Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: "Huang, Ying" Cc: John Hubbard Cc: Liu Shixin Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/memory.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2bc8032a30a2f..25a77c4fe4a03 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5100,10 +5100,16 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru bool ignore_writable, bool pte_write_upgrade) { int nr = pte_pfn(fault_pte) - folio_pfn(folio); - unsigned long start = max(vmf->address - nr * PAGE_SIZE, vma->vm_start); - unsigned long end = min(vmf->address + (folio_nr_pages(folio) - nr) * PAGE_SIZE, vma->vm_end); - pte_t *start_ptep = vmf->pte - (vmf->address - start) / PAGE_SIZE; - unsigned long addr; + unsigned long start, end, addr = vmf->address; + unsigned long addr_start = addr - (nr << PAGE_SHIFT); + unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE); + pte_t *start_ptep; + + /* Stay within the VMA and within the page table. */ + start = max3(addr_start, pt_start, vma->vm_start); + end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE, + vma->vm_end); + start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT); /* Restore all PTEs' mapping of the large folio */ for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) { From 0b1ef4fde7a24909ff2afacffd0d6afa28b73652 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 23 May 2024 09:21:39 -0400 Subject: [PATCH 121/272] mm/debug_vm_pgtable: drop RANDOM_ORVALUE trick Macro RANDOM_ORVALUE was used to make sure the pgtable entry will be populated with !none data in clear tests. 
The RANDOM_ORVALUE tried to cover mostly all the bits in a pgtable entry, even if there's no discussion on whether all the bits will be valid. Both S390 and PPC64 have their own masks to avoid touching some bits. Now it's the turn for x86_64. The issue is there's a recent report from Mikhail Gavrilov showing that this can cause a warning with the newly added pte set check in commit 8430557fc5 on writable v.s. userfaultfd-wp bit, even though the check itself was valid, the random pte is not. We can choose to mask more bits out. However the need to have such random bits setup is questionable, as now it's already guaranteed to be true on below: - For pte level, the pgtable entry will be installed with value from pfn_pte(), where pfn points to a valid page. Hence the pte will be !none already if populated with pfn_pte(). - For upper-than-pte level, the pgtable entry should contain a directory entry always, which is also !none. All the cases look like good enough to test a pxx_clear() helper. Instead of extending the bitmask, drop the "set random bits" trick completely. Add some warning guards to make sure the entries will be !none before clear().
Link: https://lkml.kernel.org/r/20240523132139.289719-1-peterx@redhat.com Fixes: 8430557fc584 ("mm/page_table_check: support userfault wr-protect entries") Signed-off-by: Peter Xu Reported-by: Mikhail Gavrilov Link: https://lore.kernel.org/r/CABXGCsMB9A8-X+Np_Q+fWLURYL_0t3Y-MdoNabDM-Lzk58-DGA@mail.gmail.com Tested-by: Mikhail Gavrilov Reviewed-by: Pasha Tatashin Acked-by: David Hildenbrand Cc: Aneesh Kumar K.V Cc: Gavin Shan Cc: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index b104a353b532b..e4969fb54da34 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -40,22 +40,7 @@ * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics * expectations that are being validated here. All future changes in here * or the documentation need to be in sync. - * - * On s390 platform, the lower 4 bits are used to identify given page table - * entry type. But these bits might affect the ability to clear entries with - * pxx_clear() because of how dynamic page table folding works on s390. So - * while loading up the entries do not change the lower 4 bits. It does not - * have affect any other platform. Also avoid the 62nd bit on ppc64 that is - * used to mark a pte entry. 
*/ -#define S390_SKIP_MASK GENMASK(3, 0) -#if __BITS_PER_LONG == 64 -#define PPC64_SKIP_MASK GENMASK(62, 62) -#else -#define PPC64_SKIP_MASK 0x0 -#endif -#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK) -#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) struct pgtable_debug_args { @@ -511,8 +496,7 @@ static void __init pud_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PUD clear\n"); - pud = __pud(pud_val(pud) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pudp, pud); + WARN_ON(pud_none(pud)); pud_clear(args->pudp); pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); @@ -548,8 +532,7 @@ static void __init p4d_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating P4D clear\n"); - p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); - WRITE_ONCE(*args->p4dp, p4d); + WARN_ON(p4d_none(p4d)); p4d_clear(args->p4dp); p4d = READ_ONCE(*args->p4dp); WARN_ON(!p4d_none(p4d)); @@ -582,8 +565,7 @@ static void __init pgd_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PGD clear\n"); - pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pgdp, pgd); + WARN_ON(pgd_none(pgd)); pgd_clear(args->pgdp); pgd = READ_ONCE(*args->pgdp); WARN_ON(!pgd_none(pgd)); @@ -634,10 +616,8 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) if (WARN_ON(!args->ptep)) return; -#ifndef CONFIG_RISCV - pte = __pte(pte_val(pte) | RANDOM_ORVALUE); -#endif set_pte_at(args->mm, args->vaddr, args->ptep, pte); + WARN_ON(pte_none(pte)); flush_dcache_page(page); barrier(); ptep_clear(args->mm, args->vaddr, args->ptep); @@ -650,8 +630,7 @@ static void __init pmd_clear_tests(struct pgtable_debug_args *args) pmd_t pmd = READ_ONCE(*args->pmdp); pr_debug("Validating PMD clear\n"); - pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pmdp, pmd); + WARN_ON(pmd_none(pmd)); pmd_clear(args->pmdp); pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); From 
9094b4a1c76cfe84b906cc152bab34d4ba26fa5c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 13 Jun 2024 16:21:19 +0800 Subject: [PATCH 122/272] mm: shmem: fix getting incorrect lruvec when replacing a shmem folio When testing shmem swapin, I encountered the warning below on my machine. The reason is that replacing an old shmem folio with a new one causes mem_cgroup_migrate() to clear the old folio's memcg data. As a result, the old folio cannot get the correct memcg's lruvec needed to remove itself from the LRU list when it is being freed. This could lead to possible serious problems, such as LRU list crashes due to holding the wrong LRU lock, and incorrect LRU statistics. To fix this issue, we can fallback to use the mem_cgroup_replace_folio() to replace the old shmem folio. [ 5241.100311] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x5d9960 [ 5241.100317] head: order:4 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [ 5241.100319] flags: 0x17fffe0000040068(uptodate|lru|head|swapbacked|node=0|zone=2|lastcpupid=0x3ffff) [ 5241.100323] raw: 17fffe0000040068 fffffdffd6687948 fffffdffd69ae008 0000000000000000 [ 5241.100325] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 5241.100326] head: 17fffe0000040068 fffffdffd6687948 fffffdffd69ae008 0000000000000000 [ 5241.100327] head: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 5241.100328] head: 17fffe0000000204 fffffdffd6665801 ffffffffffffffff 0000000000000000 [ 5241.100329] head: 0000000a00000010 0000000000000000 00000000ffffffff 0000000000000000 [ 5241.100330] page dumped because: VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled()) [ 5241.100338] ------------[ cut here ]------------ [ 5241.100339] WARNING: CPU: 19 PID: 78402 at include/linux/memcontrol.h:775 folio_lruvec_lock_irqsave+0x140/0x150 [...] 
[ 5241.100374] pc : folio_lruvec_lock_irqsave+0x140/0x150 [ 5241.100375] lr : folio_lruvec_lock_irqsave+0x138/0x150 [ 5241.100376] sp : ffff80008b38b930 [...] [ 5241.100398] Call trace: [ 5241.100399] folio_lruvec_lock_irqsave+0x140/0x150 [ 5241.100401] __page_cache_release+0x90/0x300 [ 5241.100404] __folio_put+0x50/0x108 [ 5241.100406] shmem_replace_folio+0x1b4/0x240 [ 5241.100409] shmem_swapin_folio+0x314/0x528 [ 5241.100411] shmem_get_folio_gfp+0x3b4/0x930 [ 5241.100412] shmem_fault+0x74/0x160 [ 5241.100414] __do_fault+0x40/0x218 [ 5241.100417] do_shared_fault+0x34/0x1b0 [ 5241.100419] do_fault+0x40/0x168 [ 5241.100420] handle_pte_fault+0x80/0x228 [ 5241.100422] __handle_mm_fault+0x1c4/0x440 [ 5241.100424] handle_mm_fault+0x60/0x1f0 [ 5241.100426] do_page_fault+0x120/0x488 [ 5241.100429] do_translation_fault+0x4c/0x68 [ 5241.100431] do_mem_abort+0x48/0xa0 [ 5241.100434] el0_da+0x38/0xc0 [ 5241.100436] el0t_64_sync_handler+0x68/0xc0 [ 5241.100437] el0t_64_sync+0x14c/0x150 [ 5241.100439] ---[ end trace 0000000000000000 ]--- [baolin.wang@linux.alibaba.com: remove less helpful comments, per Matthew] Link: https://lkml.kernel.org/r/ccad3fe1375b468ebca3227b6b729f3eaf9d8046.1718423197.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/3c11000dd6c1df83015a8321a859e9775ebbc23e.1718266112.git.baolin.wang@linux.alibaba.com Fixes: 85ce2c517ade ("memcontrol: only transfer the memcg data for migration") Signed-off-by: Baolin Wang Reviewed-by: Shakeel Butt Cc: Matthew Wilcox (Oracle) Cc: Hugh Dickins Cc: Johannes Weiner Cc: Nhat Pham Cc: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 +-- mm/shmem.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36793e509f470..71fe2a95b8bd3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7745,8 +7745,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) * @new: Replacement folio. 
* * Charge @new as a replacement folio for @old. @old will - * be uncharged upon free. This is only used by the page cache - * (in replace_page_cache_folio()). + * be uncharged upon free. * * Both folios must be locked, @new->mapping must be set up. */ diff --git a/mm/shmem.c b/mm/shmem.c index f5d60436b604a..a8b181a634029 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1786,7 +1786,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, xa_lock_irq(&swap_mapping->i_pages); error = shmem_replace_entry(swap_mapping, swap_index, old, new); if (!error) { - mem_cgroup_migrate(old, new); + mem_cgroup_replace_folio(old, new); __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); __lruvec_stat_mod_folio(new, NR_SHMEM, 1); __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); From 01c8f9806bde438ca1c8cbbc439f0a14a6694f6c Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Tue, 11 Jun 2024 15:32:29 +0200 Subject: [PATCH 123/272] kcov: don't lose track of remote references during softirqs In kcov_remote_start()/kcov_remote_stop(), we swap the previous KCOV metadata of the current task into a per-CPU variable. However, the kcov_mode_enabled(mode) check is not sufficient in the case of remote KCOV coverage: current->kcov_mode always remains KCOV_MODE_DISABLED for remote KCOV objects. If the original task that has invoked the KCOV_REMOTE_ENABLE ioctl happens to get interrupted and kcov_remote_start() is called, it ultimately leads to kcov_remote_stop() NOT restoring the original KCOV reference. So when the task exits, all registered remote KCOV handles remain active forever. The most uncomfortable effect (at least for syzkaller) is that the bug prevents the reuse of the same /sys/kernel/debug/kcov descriptor. If we obtain it in the parent process and then e.g. 
drop some capabilities and continuously fork to execute individual programs, at some point current->kcov of the forked process is lost, kcov_task_exit() takes no action, and all KCOV_REMOTE_ENABLE ioctls calls from subsequent forks fail. And, yes, the efficiency is also affected if we keep on losing remote kcov objects. a) kcov_remote_map keeps on growing forever. b) (If I'm not mistaken), we're also not freeing the memory referenced by kcov->area. Fix it by introducing a special kcov_mode that is assigned to the task that owns a KCOV remote object. It makes kcov_mode_enabled() return true and yet does not trigger coverage collection in __sanitizer_cov_trace_pc() and write_comp_data(). [nogikh@google.com: replace WRITE_ONCE() with an ordinary assignment] Link: https://lkml.kernel.org/r/20240614171221.2837584-1-nogikh@google.com Link: https://lkml.kernel.org/r/20240611133229.527822-1-nogikh@google.com Fixes: 5ff3b30ab57d ("kcov: collect coverage from interrupts") Signed-off-by: Aleksandr Nogikh Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Arnd Bergmann Cc: Marco Elver Cc: Signed-off-by: Andrew Morton --- include/linux/kcov.h | 2 ++ kernel/kcov.c | 1 + 2 files changed, 3 insertions(+) diff --git a/include/linux/kcov.h b/include/linux/kcov.h index b851ba415e03f..3b479a3d235a9 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -21,6 +21,8 @@ enum kcov_mode { KCOV_MODE_TRACE_PC = 2, /* Collecting comparison operands mode. */ KCOV_MODE_TRACE_CMP = 3, + /* The process owns a KCOV remote reference. 
*/ + KCOV_MODE_REMOTE = 4, }; #define KCOV_IN_CTXSW (1 << 30) diff --git a/kernel/kcov.c b/kernel/kcov.c index c3124f6d5536b..f0a69d402066e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -632,6 +632,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return -EINVAL; kcov->mode = mode; t->kcov = kcov; + t->kcov_mode = KCOV_MODE_REMOTE; kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; From a986fa57fd81a1430e00b3c6cf8a325d6f894a63 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 14 Jun 2024 22:29:10 +1000 Subject: [PATCH 124/272] KVM: PPC: Book3S HV: Prevent UAF in kvm_spapr_tce_attach_iommu_group() Al reported a possible use-after-free (UAF) in kvm_spapr_tce_attach_iommu_group(). It looks up `stt` from tablefd, but then continues to use it after doing fdput() on the returned fd. After the fdput() the tablefd is free to be closed by another thread. The close calls kvm_spapr_tce_release() and then release_spapr_tce_table() (via call_rcu()) which frees `stt`. Although there are calls to rcu_read_lock() in kvm_spapr_tce_attach_iommu_group() they are not sufficient to prevent the UAF, because `stt` is used outside the locked regions. With an artifcial delay after the fdput() and a userspace program which triggers the race, KASAN detects the UAF: BUG: KASAN: slab-use-after-free in kvm_spapr_tce_attach_iommu_group+0x298/0x720 [kvm] Read of size 4 at addr c000200027552c30 by task kvm-vfio/2505 CPU: 54 PID: 2505 Comm: kvm-vfio Not tainted 6.10.0-rc3-next-20240612-dirty #1 Hardware name: 8335-GTH POWER9 0x4e1202 opal:skiboot-v6.5.3-35-g1851b2a06 PowerNV Call Trace: dump_stack_lvl+0xb4/0x108 (unreliable) print_report+0x2b4/0x6ec kasan_report+0x118/0x2b0 __asan_load4+0xb8/0xd0 kvm_spapr_tce_attach_iommu_group+0x298/0x720 [kvm] kvm_vfio_set_attr+0x524/0xac0 [kvm] kvm_device_ioctl+0x144/0x240 [kvm] sys_ioctl+0x62c/0x1810 system_call_exception+0x190/0x440 system_call_vectored_common+0x15c/0x2ec ... Freed by task 0: ... 
kfree+0xec/0x3e0 release_spapr_tce_table+0xd4/0x11c [kvm] rcu_core+0x568/0x16a0 handle_softirqs+0x23c/0x920 do_softirq_own_stack+0x6c/0x90 do_softirq_own_stack+0x58/0x90 __irq_exit_rcu+0x218/0x2d0 irq_exit+0x30/0x80 arch_local_irq_restore+0x128/0x230 arch_local_irq_enable+0x1c/0x30 cpuidle_enter_state+0x134/0x5cc cpuidle_enter+0x6c/0xb0 call_cpuidle+0x7c/0x100 do_idle+0x394/0x410 cpu_startup_entry+0x60/0x70 start_secondary+0x3fc/0x410 start_secondary_prolog+0x10/0x14 Fix it by delaying the fdput() until `stt` is no longer in use, which is effectively the entire function. To keep the patch minimal add a call to fdput() at each of the existing return paths. Future work can convert the function to goto or __cleanup style cleanup. With the fix in place the test case no longer triggers the UAF. Reported-by: Al Viro Closes: https://lore.kernel.org/all/20240610024437.GA1464458@ZenIV/ Signed-off-by: Michael Ellerman Link: https://msgid.link/20240614122910.3499489-1-mpe@ellerman.id.au --- arch/powerpc/kvm/book3s_64_vio.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index b569ebaa590e2..3ff3de9a52acf 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -130,14 +130,16 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, } rcu_read_unlock(); - fdput(f); - - if (!found) + if (!found) { + fdput(f); return -EINVAL; + } table_group = iommu_group_get_iommudata(grp); - if (WARN_ON(!table_group)) + if (WARN_ON(!table_group)) { + fdput(f); return -EFAULT; + } for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { struct iommu_table *tbltmp = table_group->tables[i]; @@ -158,8 +160,10 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, break; } } - if (!tbl) + if (!tbl) { + fdput(f); return -EINVAL; + } rcu_read_lock(); list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { @@ -170,6 +174,7 @@ long 
kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, /* stit is being destroyed */ iommu_tce_table_put(tbl); rcu_read_unlock(); + fdput(f); return -ENOTTY; } /* @@ -177,6 +182,7 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, * its KVM reference counter and can return. */ rcu_read_unlock(); + fdput(f); return 0; } rcu_read_unlock(); @@ -184,6 +190,7 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, stit = kzalloc(sizeof(*stit), GFP_KERNEL); if (!stit) { iommu_tce_table_put(tbl); + fdput(f); return -ENOMEM; } @@ -192,6 +199,7 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, list_add_rcu(&stit->next, &stt->iommu_tables); + fdput(f); return 0; } From a5d400b6439ac734a5c0dbb641e26a38736abc17 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 29 May 2024 00:48:54 -0300 Subject: [PATCH 125/272] arm64: dts: imx93-11x11-evk: Remove the 'no-sdio' property The usdhc2 port is connected to the microSD slot. The presence of the 'no-sdio' property prevents Wifi SDIO cards, such as CMP9010-X-EVB [1] to be detected. Remove the 'no-sdio' property so that SDIO cards could also work. 
[1] https://www.nxp.com/products/wireless-connectivity/wi-fi-plus-bluetooth-plus-802-15-4/cmp9010-x-evb-iw416-usd-interface-evaluation-board:CMP9010-X-EVB Fixes: e37907bd8294 ("arm64: dts: freescale: add i.MX93 11x11 EVK basic support") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts b/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts index d400d85f42a92..bd98eff4d685f 100644 --- a/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts +++ b/arch/arm64/boot/dts/freescale/imx93-11x11-evk.dts @@ -296,7 +296,6 @@ vmmc-supply = <&reg_usdhc2_vmmc>; bus-width = <4>; status = "okay"; - no-sdio; no-mmc; }; From 8043832e2a123fd9372007a29192f2f3ba328cd6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Fri, 14 Jun 2024 11:05:43 +0300 Subject: [PATCH 126/272] memblock: use numa_valid_node() helper to check for invalid node ID Introduce numa_valid_node(nid) that verifies that nid is a valid node ID and use that instead of comparing nid parameter with either NUMA_NO_NODE or MAX_NUMNODES. This makes the checks for valid node IDs consistent and more robust and allows to get rid of multiple WARNings. 
Suggested-by: Linus Torvalds Signed-off-by: Mike Rapoport (IBM) --- include/linux/numa.h | 5 +++++ mm/memblock.c | 28 +++++++--------------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/include/linux/numa.h b/include/linux/numa.h index 1d43371fafd2f..eb19503604fe3 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -15,6 +15,11 @@ #define NUMA_NO_NODE (-1) #define NUMA_NO_MEMBLK (-1) +static inline bool numa_valid_node(int nid) +{ + return nid >= 0 && nid < MAX_NUMNODES; +} + /* optionally keep NUMA memory info available post init */ #ifdef CONFIG_NUMA_KEEP_MEMINFO #define __initdata_or_meminfo diff --git a/mm/memblock.c b/mm/memblock.c index 08e9806b1cf91..e81fb68f7f888 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -754,7 +754,7 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt /* calculate lose page */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { - if (nid == NUMA_NO_NODE) + if (!numa_valid_node(nid)) nr_pages += end_pfn - start_pfn; } @@ -1061,7 +1061,7 @@ static bool should_skip_region(struct memblock_type *type, return false; /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != m_nid) + if (numa_valid_node(nid) && nid != m_nid) return true; /* skip hotpluggable memory regions if needed */ @@ -1118,10 +1118,6 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, - "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - for (; idx_a < type_a->cnt; idx_a++) { struct memblock_region *m = &type_a->regions[idx_a]; @@ -1215,9 +1211,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. 
Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - if (*idx == (u64)ULLONG_MAX) { idx_a = type_a->cnt - 1; if (type_b != NULL) @@ -1303,7 +1296,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - if (nid == MAX_NUMNODES || nid == r_nid) + if (!numa_valid_node(nid) || nid == r_nid) break; } if (*idx >= type->cnt) { @@ -1339,10 +1332,6 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, int start_rgn, end_rgn; int i, ret; - if (WARN_ONCE(nid == MAX_NUMNODES, - "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); if (ret) return ret; @@ -1452,9 +1441,6 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - if (!align) { /* Can't use WARNs this early in boot on powerpc */ dump_stack(); @@ -1467,7 +1453,7 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, if (found && !memblock_reserve(found, size)) goto done; - if (nid != NUMA_NO_NODE && !exact_nid) { + if (numa_valid_node(nid) && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); @@ -1987,7 +1973,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) end = base + size - 1; flags = rgn->flags; #ifdef CONFIG_NUMA - if (memblock_get_region_node(rgn) != MAX_NUMNODES) + if (numa_valid_node(memblock_get_region_node(rgn))) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif @@ -2181,7 +2167,7 @@ static void __init memmap_init_reserved_pages(void) start = region->base; end = start + region->size; - if (nid == NUMA_NO_NODE || nid >= MAX_NUMNODES) + if (!numa_valid_node(nid)) nid = 
early_pfn_to_nid(PFN_DOWN(start)); reserve_bootmem_region(start, end, nid); @@ -2272,7 +2258,7 @@ static int memblock_debug_show(struct seq_file *m, void *private) seq_printf(m, "%4d: ", i); seq_printf(m, "%pa..%pa ", &reg->base, &end); - if (nid != MAX_NUMNODES) + if (numa_valid_node(nid)) seq_printf(m, "%4d ", nid); else seq_printf(m, "%4c ", 'x'); From b1fd0d1285b1eae8b99af36fb26ed2512b809af6 Mon Sep 17 00:00:00 2001 From: Ajrat Makhmutov Date: Sat, 15 Jun 2024 15:54:57 +0300 Subject: [PATCH 127/272] ALSA: hda/realtek: Enable headset mic on IdeaPad 330-17IKB 81DM Headset microphone does not work out of the box with this laptop. This quirk fixes it. Zihao Wang specified the wrong subsystem id in his patch. Link: https://lore.kernel.org/all/20220424084120.74125-1-wzhd@ustc.edu/ Fixes: 3b79954fd00d ("ALSA: hda/realtek: Add quirk for Yoga Duet 7 13ITL6 speakers") Signed-off-by: Ajrat Makhmutov Link: https://lore.kernel.org/r/20240615125457.167844-1-rauty@altlinux.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 79736c8782a31..a7594d46055d7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10506,7 +10506,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940 / Yoga Duet 7", ALC298_FIXUP_LENOVO_C940_DUET7), SND_PCI_QUIRK(0x17aa, 0x3819, "Lenovo 13s Gen2 ITL", ALC287_FIXUP_13S_GEN2_SPEAKERS), - SND_PCI_QUIRK(0x17aa, 0x3820, "Yoga Duet 7 13ITL6", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), + SND_PCI_QUIRK(0x17aa, 0x3820, "IdeaPad 330-17IKB 81DM", ALC269_FIXUP_ASPIRE_HEADSET_MIC), SND_PCI_QUIRK(0x17aa, 0x3824, "Legion Y9000X 2020", ALC285_FIXUP_LEGION_Y9000X_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3827, "Ideapad S740", ALC285_FIXUP_IDEAPAD_S740_COEF), 
SND_PCI_QUIRK(0x17aa, 0x3834, "Lenovo IdeaPad Slim 9i 14ITL5", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), From 1a49509885cd984b6f63c91da612e258b8a89fc8 Mon Sep 17 00:00:00 2001 From: Gergely Meszaros Date: Sun, 16 Jun 2024 10:52:33 +0200 Subject: [PATCH 128/272] ALSA: hda/realtek: Add quirk for Lenovo Yoga Pro 7 14ARP8 Similarly to other Lenovo laptops these also have a dual speaker setup with a shared amplifier. The model also seems to have a conflicting PCI SSID with the codec SSID for the Legion Y9000X 2022 IAH7. Only tested on the Yoga Pro 7, as I don't have access to the other laptop. Signed-off-by: Gergely Meszaros Link: https://lore.kernel.org/r/20240616085233.16922-1-meszaros.gergely97@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a7594d46055d7..fd337fdd30f69 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7520,6 +7520,7 @@ enum { ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1, ALC287_FIXUP_LENOVO_THKPAD_WH_ALC1318, ALC256_FIXUP_CHROME_BOOK, + ALC287_FIXUP_LENOVO_14ARP8_LEGION_IAH7, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -7559,6 +7560,21 @@ static void alc287_fixup_lenovo_14irp8_duetitl(struct hda_codec *codec, __snd_hda_apply_fixup(codec, id, action, 0); } +/* Similar to above the Lenovo Yoga Pro 7 14ARP8 PCI SSID matches the codec SSID of the + Legion Y9000X 2022 IAH7.*/ +static void alc287_fixup_lenovo_14arp8_legion_iah7(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + int id; + + if (codec->core.subsystem_id == 0x17aa386e) + id = ALC287_FIXUP_CS35L41_I2C_2; /* Legion Y9000X 2022 IAH7 */ + else + id = ALC285_FIXUP_SPEAKER2_TO_DAC1; /* Yoga Pro 7 14ARP8 */ + __snd_hda_apply_fixup(codec, id, action, 0); +} + /* Another hilarious PCI SSID conflict with Lenovo Legion Pro 7 16ARX8H (with * TAS2781 codec) and 
Legion 7i 16IAX7 (with CS35L41 codec); * we apply a corresponding fixup depending on the codec SSID instead @@ -9658,6 +9674,10 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK, }, + [ALC287_FIXUP_LENOVO_14ARP8_LEGION_IAH7] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc287_fixup_lenovo_14arp8_legion_iah7, + }, [ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN] = { .type = HDA_FIXUP_FUNC, .v.func = alc287_fixup_yoga9_14iap7_bass_spk_pin, @@ -10520,7 +10540,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3865, "Lenovo 13X", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x3866, "Lenovo 13X", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x3869, "Lenovo Yoga7 14IAL7", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), - SND_PCI_QUIRK(0x17aa, 0x386e, "Legion Y9000X 2022 IAH7", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x386e, "Legion Y9000X 2022 IAH7 / Yoga Pro 7 14ARP8", ALC287_FIXUP_LENOVO_14ARP8_LEGION_IAH7), SND_PCI_QUIRK(0x17aa, 0x386f, "Legion Pro 7/7i", ALC287_FIXUP_LENOVO_LEGION_7), SND_PCI_QUIRK(0x17aa, 0x3870, "Lenovo Yoga 7 14ARB7", ALC287_FIXUP_YOGA7_14ARB7_I2C), SND_PCI_QUIRK(0x17aa, 0x3877, "Lenovo Legion 7 Slim 16ARHA7", ALC287_FIXUP_CS35L41_I2C_2), From 67cc6125fb39902169707cb6277f010e56d4a40a Mon Sep 17 00:00:00 2001 From: Max Krummenacher Date: Mon, 3 Jun 2024 16:00:45 +0200 Subject: [PATCH 129/272] arm64: dts: freescale: imx8mm-verdin: enable hysteresis on slow input pin SODIMM 17 can be used as an edge triggered interrupt supplied from an off board source. Enable hysteresis on the pinmuxing to increase immunity against noise on the signal. 
Fixes: 60f01b5b5c7d ("arm64: dts: imx8mm-verdin: update iomux configuration") Signed-off-by: Max Krummenacher Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index 0d9abca588218..98544741ce176 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -936,7 +936,7 @@ /* Verdin GPIO_9_DSI (pulled-up as active-low) */ pinctrl_gpio_9_dsi: gpio9dsigrp { fsl,pins = - ; /* SODIMM 17 */ + ; /* SODIMM 17 */ }; /* Verdin GPIO_10_DSI (pulled-up as active-low) */ From dfd239a039b3581ca25f932e66b6e2c2bf77c798 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 14 Jun 2024 11:06:32 -0400 Subject: [PATCH 130/272] arm64: dts: imx8qm-mek: fix gpio number for reg_usdhc2_vmmc The gpio in "reg_usdhc2_vmmc" should be 7 instead of 19. Cc: stable@vger.kernel.org Fixes: 307fd14d4b14 ("arm64: dts: imx: add imx8qm mek support") Reviewed-by: Peng Fan Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8qm-mek.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8qm-mek.dts b/arch/arm64/boot/dts/freescale/imx8qm-mek.dts index 5c6b39c6933fc..6e05361c1ffb2 100644 --- a/arch/arm64/boot/dts/freescale/imx8qm-mek.dts +++ b/arch/arm64/boot/dts/freescale/imx8qm-mek.dts @@ -36,7 +36,7 @@ regulator-name = "SD1_SPWR"; regulator-min-microvolt = <3000000>; regulator-max-microvolt = <3000000>; - gpio = <&lsio_gpio4 19 GPIO_ACTIVE_HIGH>; + gpio = <&lsio_gpio4 7 GPIO_ACTIVE_HIGH>; enable-active-high; }; From 348a1983cf4cf5099fc398438a968443af4c9f65 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 13 Jun 2024 08:51:48 +1000 Subject: [PATCH 131/272] xfs: fix unlink vs cluster buffer instantiation race Luis has been reporting an assert failure when freeing an inode 
cluster during inode inactivation for a while. The assert looks like: XFS: Assertion failed: bp->b_flags & XBF_DONE, file: fs/xfs/xfs_trans_buf.c, line: 241 ------------[ cut here ]------------ kernel BUG at fs/xfs/xfs_message.c:102! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 4 PID: 73 Comm: kworker/4:1 Not tainted 6.10.0-rc1 #4 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Workqueue: xfs-inodegc/loop5 xfs_inodegc_worker [xfs] RIP: 0010:assfail (fs/xfs/xfs_message.c:102) xfs RSP: 0018:ffff88810188f7f0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff88816e748250 RCX: 1ffffffff844b0e7 RDX: 0000000000000004 RSI: ffff88810188f558 RDI: ffffffffc2431fa0 RBP: 1ffff11020311f01 R08: 0000000042431f9f R09: ffffed1020311e9b R10: ffff88810188f4df R11: ffffffffac725d70 R12: ffff88817a3f4000 R13: ffff88812182f000 R14: ffff88810188f998 R15: ffffffffc2423f80 FS: 0000000000000000(0000) GS:ffff8881c8400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055fe9d0f109c CR3: 000000014426c002 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: xfs_trans_read_buf_map (fs/xfs/xfs_trans_buf.c:241 (discriminator 1)) xfs xfs_imap_to_bp (fs/xfs/xfs_trans.h:210 fs/xfs/libxfs/xfs_inode_buf.c:138) xfs xfs_inode_item_precommit (fs/xfs/xfs_inode_item.c:145) xfs xfs_trans_run_precommits (fs/xfs/xfs_trans.c:931) xfs __xfs_trans_commit (fs/xfs/xfs_trans.c:966) xfs xfs_inactive_ifree (fs/xfs/xfs_inode.c:1811) xfs xfs_inactive (fs/xfs/xfs_inode.c:2013) xfs xfs_inodegc_worker (fs/xfs/xfs_icache.c:1841 fs/xfs/xfs_icache.c:1886) xfs process_one_work (kernel/workqueue.c:3231) worker_thread (kernel/workqueue.c:3306 (discriminator 2) kernel/workqueue.c:3393 (discriminator 2)) kthread (kernel/kthread.c:389) ret_from_fork (arch/x86/kernel/process.c:147) ret_from_fork_asm 
(arch/x86/entry/entry_64.S:257) And occurs when the inode precommit handler attempts to look up the inode cluster buffer to attach the inode for writeback. The trail of logic that I can reconstruct is as follows. 1. the inode is clean when inodegc runs, so it is not attached to a cluster buffer when precommit runs. 2. #1 implies the inode cluster buffer may be clean and not pinned by dirty inodes when inodegc runs. 3. #2 implies that the inode cluster buffer can be reclaimed by memory pressure at any time. 4. The assert failure implies that the cluster buffer was attached to the transaction, but not marked done. It had been accessed earlier in the transaction, but not marked done. 5. #4 implies the cluster buffer has been invalidated (i.e. marked stale). 6. #5 implies that the inode cluster buffer was instantiated uninitialised in the transaction in xfs_ifree_cluster(), which only instantiates the buffers to invalidate them and never marks them as done. Given factors 1-3, this issue is highly dependent on timing and environmental factors. Hence the issue can be very difficult to reproduce in some situations, but highly reliable in others. Luis has an environment where it can be reproduced easily by g/531 but, OTOH, I've reproduced it only once in ~2000 cycles of g/531. I think the fix is to have xfs_ifree_cluster() set the XBF_DONE flag on the cluster buffers, even though they may not be initialised. The reasons why I think this is safe are: 1. A buffer cache lookup hit on a XBF_STALE buffer will clear the XBF_DONE flag. Hence all future users of the buffer know they have to re-initialise the contents before use and mark it done themselves. 2. xfs_trans_binval() sets the XFS_BLI_STALE flag, which means the buffer remains locked until the journal commit completes and the buffer is unpinned. Hence once marked XBF_STALE/XFS_BLI_STALE by xfs_ifree_cluster(), the only context that can access the freed buffer is the currently running transaction. 3. 
#2 implies that future buffer lookups in the currently running transaction will hit the transaction match code and not the buffer cache. Hence XBF_STALE and XFS_BLI_STALE will not be cleared unless the transaction initialises and logs the buffer with valid contents again. At which point, the buffer will be marked XBF_DONE again, so having XBF_DONE already set on the stale buffer is a moot point. 4. #2 also implies that any concurrent access to that cluster buffer will block waiting on the buffer lock until the inode cluster has been fully freed and is no longer an active inode cluster buffer. 5. #4 + #1 means that any future user of the disk range of that buffer will always see the range of disk blocks covered by the cluster buffer as not done, and hence must initialise the contents themselves. 6. Setting XBF_DONE in xfs_ifree_cluster() then means the unlinked inode precommit code will see a XBF_DONE buffer from the transaction match as it expects. It can then attach the stale but newly dirtied inode to the stale but newly dirtied cluster buffer without unexpected failures. The stale buffer will then sail through the journal and do the right thing with the attached stale inode during unpin. Hence the fix is just one line of extra code. The explanation of why we have to set XBF_DONE in xfs_ifree_cluster, OTOH, is long and complex.... Fixes: 82842fee6e59 ("xfs: fix AGF vs inode cluster buffer deadlock") Signed-off-by: Dave Chinner Tested-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_inode.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 58fb7a5062e1e..f36091e1e7f50 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2548,11 +2548,26 @@ xfs_ifree_cluster( * This buffer may not have been correctly initialised as we * didn't read it from disk. 
That's not important because we are * only using to mark the buffer as stale in the log, and to - * attach stale cached inodes on it. That means it will never be - * dispatched for IO. If it is, we want to know about it, and we - * want it to fail. We can acheive this by adding a write - * verifier to the buffer. + * attach stale cached inodes on it. + * + * For the inode that triggered the cluster freeing, this + * attachment may occur in xfs_inode_item_precommit() after we + * have marked this buffer stale. If this buffer was not in + * memory before xfs_ifree_cluster() started, it will not be + * marked XBF_DONE and this will cause problems later in + * xfs_inode_item_precommit() when we trip over a (stale, !done) + * buffer to attached to the transaction. + * + * Hence we have to mark the buffer as XFS_DONE here. This is + * safe because we are also marking the buffer as XBF_STALE and + * XFS_BLI_STALE. That means it will never be dispatched for + * IO and it won't be unlocked until the cluster freeing has + * been committed to the journal and the buffer unpinned. If it + * is written, we want to know about it, and we want it to + * fail. We can acheive this by adding a write verifier to the + * buffer. */ + bp->b_flags |= XBF_DONE; bp->b_ops = &xfs_inode_buf_ops; /* From 49cc17967be95d64606d5684416ee51eec35e84a Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 14 Jun 2024 17:23:11 +0300 Subject: [PATCH 132/272] drm/i915/mso: using joiner is not possible with eDP MSO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's not possible to use the joiner at the same time with eDP MSO. When a panel needs MSO, it's not optional, so MSO trumps joiner. 
v3: Only change intel_dp_has_joiner(), leave debugfs alone (Ville) Fixes: bc71194e8897 ("drm/i915/edp: enable eDP MSO during link training") Cc: # v5.13+ Cc: Ville Syrjala Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1668 Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240614142311.589089-1-jani.nikula@intel.com Signed-off-by: Jani Nikula (cherry picked from commit 8b5a92ca24eb96bb71e2a55e352687487d87687f) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index e05e25cd4a940..5b3b6ae1e3d71 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -442,6 +442,10 @@ bool intel_dp_has_bigjoiner(struct intel_dp *intel_dp) struct intel_encoder *encoder = &intel_dig_port->base; struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + /* eDP MSO is not compatible with joiner */ + if (intel_dp->mso_link_count) + return false; + return DISPLAY_VER(dev_priv) >= 12 || (DISPLAY_VER(dev_priv) == 11 && encoder->port != PORT_A); From 8c4d6945fe5bd04ff847c3c788abd34ca354ecee Mon Sep 17 00:00:00 2001 From: Alexey Makhalov Date: Sat, 15 Jun 2024 18:25:10 -0700 Subject: [PATCH 133/272] drm/vmwgfx: Fix missing HYPERVISOR_GUEST dependency VMWARE_HYPERCALL alternative will not work as intended without VMware guest code initialization. [ bp: note that this doesn't reproduce with newer gccs so it must be something gcc-9-specific. 
] Closes: https://lore.kernel.org/oe-kbuild-all/202406152104.FxakP1MB-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Alexey Makhalov Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240616012511.198243-1-alexey.makhalov@broadcom.com --- drivers/gpu/drm/vmwgfx/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vmwgfx/Kconfig b/drivers/gpu/drm/vmwgfx/Kconfig index faddae3d6ac2e..6f1ac940cbae7 100644 --- a/drivers/gpu/drm/vmwgfx/Kconfig +++ b/drivers/gpu/drm/vmwgfx/Kconfig @@ -2,7 +2,7 @@ config DRM_VMWGFX tristate "DRM driver for VMware Virtual GPU" depends on DRM && PCI && MMU - depends on X86 || ARM64 + depends on (X86 && HYPERVISOR_GUEST) || ARM64 select DRM_TTM select DRM_TTM_HELPER select MAPPING_DIRTY_HELPERS From 0b9130247f3b6a1122478471ff0e014ea96bb735 Mon Sep 17 00:00:00 2001 From: Gavrilov Ilia Date: Thu, 13 Jun 2024 08:23:00 +0000 Subject: [PATCH 134/272] netrom: Fix a memory leak in nr_heartbeat_expiry() syzbot reported a memory leak in nr_create() [0]. Commit 409db27e3a2e ("netrom: Fix use-after-free of a listening socket.") added sock_hold() to the nr_heartbeat_expiry() function, where a) a socket has a SOCK_DESTROY flag or b) a listening socket has a SOCK_DEAD flag. But in the case "a," when the SOCK_DESTROY flag is set, the file descriptor has already been closed and the nr_release() function has been called. So it makes no sense to hold the reference count because no one will call another nr_destroy_socket() and put it as in the case "b." 
nr_connect nr_establish_data_link nr_start_heartbeat nr_release switch (nr->state) case NR_STATE_3 nr->state = NR_STATE_2 sock_set_flag(sk, SOCK_DESTROY); nr_rx_frame nr_process_rx_frame switch (nr->state) case NR_STATE_2 nr_state2_machine() nr_disconnect() nr_sk(sk)->state = NR_STATE_0 sock_set_flag(sk, SOCK_DEAD) nr_heartbeat_expiry switch (nr->state) case NR_STATE_0 if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) sock_hold() // ( !!! ) nr_destroy_socket() To fix the memory leak, let's call sock_hold() only for a listening socket. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with Syzkaller. [0]: https://syzkaller.appspot.com/bug?extid=d327a1f3b12e1e206c16 Reported-by: syzbot+d327a1f3b12e1e206c16@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d327a1f3b12e1e206c16 Fixes: 409db27e3a2e ("netrom: Fix use-after-free of a listening socket.") Signed-off-by: Gavrilov Ilia Signed-off-by: David S. Miller --- net/netrom/nr_timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 4e7c968cde2dc..5e3ca068f04e0 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -121,7 +121,8 @@ static void nr_heartbeat_expiry(struct timer_list *t) is accepted() it isn't 'dead' so doesn't get removed. */ if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { - sock_hold(sk); + if (sk->sk_state == TCP_LISTEN) + sock_hold(sk); bh_unlock_sock(sk); nr_destroy_socket(sk); goto out; From 8e948c365d9c10b685d1deb946bd833d6a9b43e0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 17 Jun 2024 07:54:08 -0400 Subject: [PATCH 135/272] nfsd: fix oops when reading pool_stats before server is started Sourbh reported an oops that is triggerable by trying to read the pool_stats procfile before nfsd had been started. 
Move the check for a NULL serv in svc_pool_stats_start above the mutex acquisition, and fix the stop routine not to unlock the mutex if there is no serv yet. Fixes: 7b207ccd9833 ("svc: don't hold reference for poolstats, only mutex.") Reported-by: Sourabh Jain Signed-off-by: Jeff Layton Tested-by: Sourabh Jain Signed-off-by: Chuck Lever --- net/sunrpc/svc_xprt.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index dd86d7f1e97e9..49a3bea33f9d5 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1421,12 +1421,13 @@ static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos) dprintk("svc_pool_stats_start, *pidx=%u\n", pidx); + if (!si->serv) + return NULL; + mutex_lock(si->mutex); if (!pidx) return SEQ_START_TOKEN; - if (!si->serv) - return NULL; return pidx > si->serv->sv_nrpools ? NULL : &si->serv->sv_pools[pidx - 1]; } @@ -1458,7 +1459,8 @@ static void svc_pool_stats_stop(struct seq_file *m, void *p) { struct svc_info *si = m->private; - mutex_unlock(si->mutex); + if (si->serv) + mutex_unlock(si->mutex); } static int svc_pool_stats_show(struct seq_file *m, void *p) From da2c8fef130ec7197e2f91c7ed70a8c5bede4bea Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 17 Jun 2024 16:26:26 +0200 Subject: [PATCH 136/272] NFSD: grab nfsd_mutex in nfsd_nl_rpc_status_get_dumpit() Grab nfsd_mutex lock in nfsd_nl_rpc_status_get_dumpit routine and remove nfsd_nl_rpc_status_get_start() and nfsd_nl_rpc_status_get_done(). This patch fix the syzbot log reported below: INFO: task syz-executor.1:17770 blocked for more than 143 seconds. Not tainted 6.10.0-rc3-syzkaller-00022-gcea2a26553ac #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
task:syz-executor.1 state:D stack:23800 pid:17770 tgid:17767 ppid:11381 flags:0x00000006 Call Trace: context_switch kernel/sched/core.c:5408 [inline] __schedule+0x17e8/0x4a20 kernel/sched/core.c:6745 __schedule_loop kernel/sched/core.c:6822 [inline] schedule+0x14b/0x320 kernel/sched/core.c:6837 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:6894 __mutex_lock_common kernel/locking/mutex.c:684 [inline] __mutex_lock+0x6a4/0xd70 kernel/locking/mutex.c:752 nfsd_nl_listener_get_doit+0x115/0x5d0 fs/nfsd/nfsctl.c:2124 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0xb16/0xec0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x1e5/0x430 net/netlink/af_netlink.c:2564 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ec/0x980 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x8db/0xcb0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x223/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2585 ___sys_sendmsg net/socket.c:2639 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2668 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f24ed27cea9 RSP: 002b:00007f24ee0080c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f24ed3b3f80 RCX: 00007f24ed27cea9 RDX: 0000000000000000 RSI: 0000000020000100 RDI: 0000000000000005 RBP: 00007f24ed2ebff4 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 Fixes: 1bd773b4f0c9 ("nfsd: hold nfsd_mutex across entire netlink operation") Fixes: bd9d6a3efa97 ("NFSD: add rpc_status netlink support") Signed-off-by: Lorenzo Bianconi Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/netlink/specs/nfsd.yaml | 2 -- fs/nfsd/netlink.c 
| 2 -- fs/nfsd/netlink.h | 3 -- fs/nfsd/nfsctl.c | 48 ++++++--------------------- 4 files changed, 11 insertions(+), 44 deletions(-) diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml index d212340971673..6bda7a4673018 100644 --- a/Documentation/netlink/specs/nfsd.yaml +++ b/Documentation/netlink/specs/nfsd.yaml @@ -123,8 +123,6 @@ operations: doc: dump pending nfsd rpc attribute-set: rpc-status dump: - pre: nfsd-nl-rpc-status-get-start - post: nfsd-nl-rpc-status-get-done reply: attributes: - xid diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c index 62d2586d99025..529a75ecf22e8 100644 --- a/fs/nfsd/netlink.c +++ b/fs/nfsd/netlink.c @@ -44,9 +44,7 @@ static const struct nla_policy nfsd_listener_set_nl_policy[NFSD_A_SERVER_SOCK_AD static const struct genl_split_ops nfsd_nl_ops[] = { { .cmd = NFSD_CMD_RPC_STATUS_GET, - .start = nfsd_nl_rpc_status_get_start, .dumpit = nfsd_nl_rpc_status_get_dumpit, - .done = nfsd_nl_rpc_status_get_done, .flags = GENL_CMD_CAP_DUMP, }, { diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h index e3724637d64d5..2e132ef328f8d 100644 --- a/fs/nfsd/netlink.h +++ b/fs/nfsd/netlink.h @@ -15,9 +15,6 @@ extern const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1]; extern const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1]; -int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb); -int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb); - int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 202140df8f82e..533b65057e18e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1460,28 +1460,6 @@ static int create_proc_exports_entry(void) unsigned int nfsd_net_id; -/** - * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit - * @cb: netlink metadata and command arguments - * - * Return 
values: - * %0: The rpc_status_get command may proceed - * %-ENODEV: There is no NFSD running in this namespace - */ -int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb) -{ - struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id); - int ret = -ENODEV; - - mutex_lock(&nfsd_mutex); - if (nn->nfsd_serv) - ret = 0; - else - mutex_unlock(&nfsd_mutex); - - return ret; -} - static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, struct netlink_callback *cb, struct nfsd_genl_rqstp *rqstp) @@ -1558,8 +1536,16 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id); int i, ret, rqstp_index = 0; + struct nfsd_net *nn; + + mutex_lock(&nfsd_mutex); + + nn = net_generic(sock_net(skb->sk), nfsd_net_id); + if (!nn->nfsd_serv) { + ret = -ENODEV; + goto out_unlock; + } rcu_read_lock(); @@ -1636,22 +1622,10 @@ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, ret = skb->len; out: rcu_read_unlock(); - - return ret; -} - -/** - * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing - * @cb: netlink metadata and command arguments - * - * Return values: - * %0: Success - */ -int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb) -{ +out_unlock: mutex_unlock(&nfsd_mutex); - return 0; + return ret; } /** From 1ab1a422c0daedbd76f9f25c297eca48986ddea0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 17 Jun 2024 11:13:01 -0700 Subject: [PATCH 137/272] MAINTAINERS: Update entries for Kees Cook Update current email address for Kees Cook in the MAINTAINER file to match the change from commit 4e173c825b19 ("mailmap: update entry for Kees Cook"). 
Link: https://lore.kernel.org/r/20240617181257.work.206-kees@kernel.org Signed-off-by: Kees Cook --- MAINTAINERS | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8754ac2c259dc..f601a2fd1ebf1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5296,7 +5296,7 @@ F: drivers/infiniband/hw/usnic/ CLANG CONTROL FLOW INTEGRITY SUPPORT M: Sami Tolvanen -M: Kees Cook +M: Kees Cook R: Nathan Chancellor L: llvm@lists.linux.dev S: Supported @@ -8212,7 +8212,7 @@ F: rust/kernel/net/phy.rs EXEC & BINFMT API, ELF R: Eric Biederman -R: Kees Cook +R: Kees Cook L: linux-mm@kvack.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/execve @@ -8613,7 +8613,7 @@ S: Maintained F: drivers/net/ethernet/nvidia/* FORTIFY_SOURCE -M: Kees Cook +M: Kees Cook L: linux-hardening@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening @@ -9103,7 +9103,7 @@ F: include/linux/mfd/gsc.h F: include/linux/platform_data/gsc_hwmon.h GCC PLUGINS -M: Kees Cook +M: Kees Cook L: linux-hardening@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening @@ -9237,7 +9237,7 @@ S: Maintained F: drivers/input/touchscreen/resistive-adc-touch.c GENERIC STRING LIBRARY -M: Kees Cook +M: Kees Cook R: Andy Shevchenko L: linux-hardening@vger.kernel.org S: Supported @@ -11951,7 +11951,7 @@ F: scripts/package/ F: usr/ KERNEL HARDENING (not covered by other areas) -M: Kees Cook +M: Kees Cook R: Gustavo A. R. 
Silva L: linux-hardening@vger.kernel.org S: Supported @@ -12479,7 +12479,7 @@ F: drivers/scsi/53c700* LEAKING_ADDRESSES M: Tycho Andersen -R: Kees Cook +R: Kees Cook L: linux-hardening@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening @@ -12775,7 +12775,7 @@ F: arch/powerpc/platforms/8xx/ F: arch/powerpc/platforms/83xx/ LINUX KERNEL DUMP TEST MODULE (LKDTM) -M: Kees Cook +M: Kees Cook S: Maintained F: drivers/misc/lkdtm/* F: tools/testing/selftests/lkdtm/* @@ -12905,7 +12905,7 @@ Q: http://patchwork.linuxtv.org/project/linux-media/list/ F: drivers/media/usb/dvb-usb-v2/lmedm04* LOADPIN SECURITY MODULE -M: Kees Cook +M: Kees Cook S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening F: Documentation/admin-guide/LSM/LoadPin.rst @@ -17998,7 +17998,7 @@ F: tools/testing/selftests/proc/ PROC SYSCTL M: Luis Chamberlain -M: Kees Cook +M: Kees Cook M: Joel Granados L: linux-kernel@vger.kernel.org L: linux-fsdevel@vger.kernel.org @@ -18054,7 +18054,7 @@ F: Documentation/devicetree/bindings/net/pse-pd/ F: drivers/net/pse-pd/ PSTORE FILESYSTEM -M: Kees Cook +M: Kees Cook R: Tony Luck R: Guilherme G. 
Piccoli L: linux-hardening@vger.kernel.org @@ -20060,7 +20060,7 @@ F: drivers/media/cec/platform/seco/seco-cec.c F: drivers/media/cec/platform/seco/seco-cec.h SECURE COMPUTING -M: Kees Cook +M: Kees Cook R: Andy Lutomirski R: Will Drewry S: Supported @@ -22974,7 +22974,7 @@ F: drivers/block/ublk_drv.c F: include/uapi/linux/ublk_cmd.h UBSAN -M: Kees Cook +M: Kees Cook R: Marco Elver R: Andrey Konovalov R: Andrey Ryabinin @@ -24812,7 +24812,7 @@ F: drivers/net/hamradio/yam* F: include/linux/yam.h YAMA SECURITY MODULE -M: Kees Cook +M: Kees Cook S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening F: Documentation/admin-guide/LSM/Yama.rst From a83e1385b780d41307433ddbc86e3c528db031f0 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Fri, 14 Jun 2024 19:31:49 +0530 Subject: [PATCH 138/272] ACPICA: Revert "ACPICA: avoid Info: mapping multiple BARs. Your kernel is fine." Undo the modifications made in commit d410ee5109a1 ("ACPICA: avoid "Info: mapping multiple BARs. Your kernel is fine.""). The initial purpose of this commit was to stop memory mappings for operation regions from overlapping page boundaries, as it can trigger warnings if different page attributes are present. However, it was found that when this situation arises, mapping continues until the boundary's end, but there is still an attempt to read/write the entire length of the map, leading to a NULL pointer dereference. For example, if a four-byte mapping request is made but only one byte is mapped because it hits the current page boundary's end, a four-byte read/write attempt is still made, resulting in a NULL pointer dereference. Instead, map the entire length, as the ACPI specification does not mandate that it must be within the same page boundary. It is permissible for it to be mapped across different regions. 
Link: https://github.com/acpica/acpica/pull/954 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218849 Fixes: d410ee5109a1 ("ACPICA: avoid "Info: mapping multiple BARs. Your kernel is fine."") Co-developed-by: Sanath S Signed-off-by: Sanath S Signed-off-by: Raju Rangoju Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/exregion.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/drivers/acpi/acpica/exregion.c b/drivers/acpi/acpica/exregion.c index 8907b8bf42672..c49b9f8de723d 100644 --- a/drivers/acpi/acpica/exregion.c +++ b/drivers/acpi/acpica/exregion.c @@ -44,7 +44,6 @@ acpi_ex_system_memory_space_handler(u32 function, struct acpi_mem_mapping *mm = mem_info->cur_mm; u32 length; acpi_size map_length; - acpi_size page_boundary_map_length; #ifdef ACPI_MISALIGNMENT_NOT_SUPPORTED u32 remainder; #endif @@ -138,26 +137,8 @@ acpi_ex_system_memory_space_handler(u32 function, map_length = (acpi_size) ((mem_info->address + mem_info->length) - address); - /* - * If mapping the entire remaining portion of the region will cross - * a page boundary, just map up to the page boundary, do not cross. - * On some systems, crossing a page boundary while mapping regions - * can cause warnings if the pages have different attributes - * due to resource management. - * - * This has the added benefit of constraining a single mapping to - * one page, which is similar to the original code that used a 4k - * maximum window. 
- */ - page_boundary_map_length = (acpi_size) - (ACPI_ROUND_UP(address, ACPI_DEFAULT_PAGE_SIZE) - address); - if (page_boundary_map_length == 0) { - page_boundary_map_length = ACPI_DEFAULT_PAGE_SIZE; - } - - if (map_length > page_boundary_map_length) { - map_length = page_boundary_map_length; - } + if (map_length > ACPI_DEFAULT_PAGE_SIZE) + map_length = ACPI_DEFAULT_PAGE_SIZE; /* Create a new mapping starting at the address given */ From c7be64355fccfe7d4727681e32fce07113e40af1 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 12 Jun 2024 12:42:20 +0200 Subject: [PATCH 139/272] ACPI: scan: Ignore camera graph port nodes on all Dell Tiger, Alder and Raptor Lake models Dell laptops with IPU6 camera (the Tiger Lake, Alder Lake and Raptor Lake generations) have broken ACPI MIPI DISCO information (this results from an OEM attempt to make Linux work by supplying it with custom data in the ACPI tables which has never been supported in the mainline). Instead of adding a lot of DMI quirks for this, check for Dell platforms based on the processor generations in question and drop the ACPI graph port nodes, likely to be created with the help of invalid data, on all of them. Fixes: bd721b934323 ("ACPI: scan: Extract CSI-2 connection graph from _CRS") Signed-off-by: Hans de Goede [ rjw: Changelog edits ] Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/internal.h | 4 ++++ drivers/acpi/mipi-disco-img.c | 28 +++++++++++++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 2a0e9fc7b74c1..601b670356e50 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -302,6 +302,10 @@ void acpi_mipi_check_crs_csi2(acpi_handle handle); void acpi_mipi_scan_crs_csi2(void); void acpi_mipi_init_crs_csi2_swnodes(void); void acpi_mipi_crs_csi2_cleanup(void); +#ifdef CONFIG_X86 bool acpi_graph_ignore_port(acpi_handle handle); +#else +static inline bool acpi_graph_ignore_port(acpi_handle handle) { return false; } +#endif #endif /* _ACPI_INTERNAL_H_ */ diff --git a/drivers/acpi/mipi-disco-img.c b/drivers/acpi/mipi-disco-img.c index d05413a0672a9..0ab13751f0dbc 100644 --- a/drivers/acpi/mipi-disco-img.c +++ b/drivers/acpi/mipi-disco-img.c @@ -725,14 +725,20 @@ void acpi_mipi_crs_csi2_cleanup(void) acpi_mipi_del_crs_csi2(csi2); } -static const struct dmi_system_id dmi_ignore_port_nodes[] = { - { - .matches = { - DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "XPS 9315"), - }, - }, - { } +#ifdef CONFIG_X86 +#include +#include + +/* CPU matches for Dell generations with broken ACPI MIPI DISCO info */ +static const struct x86_cpu_id dell_broken_mipi_disco_cpu_gens[] = { + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, NULL), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, NULL), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, NULL), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, NULL), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, NULL), + {} }; static const char *strnext(const char *s1, const char *s2) @@ -761,7 +767,10 @@ bool acpi_graph_ignore_port(acpi_handle handle) static bool dmi_tested, ignore_port; if (!dmi_tested) { - ignore_port = dmi_first_match(dmi_ignore_port_nodes); + if (dmi_name_in_vendors("Dell Inc.") && 
+ x86_match_cpu(dell_broken_mipi_disco_cpu_gens)) + ignore_port = true; + dmi_tested = true; } @@ -794,3 +803,4 @@ bool acpi_graph_ignore_port(acpi_handle handle) kfree(orig_path); return false; } +#endif From 0606c5c4ad05076dba39af055644d83e3f6ba3a4 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 12 Jun 2024 12:48:28 +0200 Subject: [PATCH 140/272] ACPI: mipi-disco-img: Switch to new Intel CPU model defines Switch over to using the new Intel CPU model defines, as the old ones are going away. Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/mipi-disco-img.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/mipi-disco-img.c b/drivers/acpi/mipi-disco-img.c index 0ab13751f0dbc..92b658f92dc0f 100644 --- a/drivers/acpi/mipi-disco-img.c +++ b/drivers/acpi/mipi-disco-img.c @@ -731,13 +731,13 @@ void acpi_mipi_crs_csi2_cleanup(void) /* CPU matches for Dell generations with broken ACPI MIPI DISCO info */ static const struct x86_cpu_id dell_broken_mipi_disco_cpu_gens[] = { - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, NULL), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, NULL), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, NULL), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, NULL), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, NULL), + X86_MATCH_VFM(INTEL_TIGERLAKE, NULL), + X86_MATCH_VFM(INTEL_TIGERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_ALDERLAKE, NULL), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_RAPTORLAKE, NULL), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, NULL), + X86_MATCH_VFM(INTEL_RAPTORLAKE_S, NULL), {} }; From 14d7c92f8df9c0964ae6f8b813c1b3ac38120825 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 17 Jun 2024 12:57:03 -0700 Subject: [PATCH 141/272] Revert "mm: mmap: allow for the maximum number of bits for randomizing mmap_base by default" This reverts commit 
3afb76a66b5559a7b595155803ce23801558a7a9. This was a wrongheaded workaround for an issue that had already been fixed much better by commit 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit"). Asking users questions at kernel compile time that they can't make sense of is not a viable strategy. And the fact that even the kernel VM maintainers apparently didn't catch that this "fix" is not a fix any more pretty much proves the point that people can't be expected to understand the implications of the question. It may well be the case that we could improve things further, and that __thp_get_unmapped_area() should take the mapping randomization into account even for 64-bit kernels. Maybe we should not be so eager to use THP mappings. But in no case should this be a kernel config option. Cc: Rafael Aquini Cc: Andrew Morton Cc: Jiri Slaby Cc: Suren Baghdasaryan Cc: Matthew Wilcox (Oracle) Signed-off-by: Linus Torvalds --- arch/Kconfig | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 3e2a63772b3db..975dd22a2dbd2 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1046,21 +1046,10 @@ config ARCH_MMAP_RND_BITS_MAX config ARCH_MMAP_RND_BITS_DEFAULT int -config FORCE_MAX_MMAP_RND_BITS - bool "Force maximum number of bits to use for ASLR of mmap base address" - default y if !64BIT - help - ARCH_MMAP_RND_BITS and ARCH_MMAP_RND_COMPAT_BITS represent the number - of bits to use for ASLR and if no custom value is assigned (EXPERT) - then the architecture's lower bound (minimum) value is assumed. - This toggle changes that default assumption to assume the arch upper - bound (maximum) value instead. 
- config ARCH_MMAP_RND_BITS int "Number of bits to use for ASLR of mmap base address" if EXPERT range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT - default ARCH_MMAP_RND_BITS_MAX if FORCE_MAX_MMAP_RND_BITS default ARCH_MMAP_RND_BITS_MIN depends on HAVE_ARCH_MMAP_RND_BITS help @@ -1095,7 +1084,6 @@ config ARCH_MMAP_RND_COMPAT_BITS int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT - default ARCH_MMAP_RND_COMPAT_BITS_MAX if FORCE_MAX_MMAP_RND_BITS default ARCH_MMAP_RND_COMPAT_BITS_MIN depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS help From 9e046bb111f13461d3f9331e24e974324245140e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Jun 2024 13:06:15 +0000 Subject: [PATCH 142/272] tcp: clear tp->retrans_stamp in tcp_rcv_fastopen_synack() Some applications were reporting ETIMEDOUT errors on apparently good looking flows, according to packet dumps. We were able to root cause the issue to an accidental setting of tp->retrans_stamp in the following scenario: - client sends TFO SYN with data. - server has TFO disabled, ACKs only SYN but not payload. - client receives SYNACK covering only SYN. - tcp_ack() eats SYN and sets tp->retrans_stamp to 0. - tcp_rcv_fastopen_synack() calls tcp_xmit_retransmit_queue() to retransmit TFO payload w/o SYN, sets tp->retrans_stamp to "now", but we are not in any loss recovery state. - TFO payload is ACKed. - we are not in any loss recovery state, and don't see any dupacks, so we don't get to any code path that clears tp->retrans_stamp. - tp->retrans_stamp stays non-zero for the lifetime of the connection. - after first RTO, tcp_clamp_rto_to_user_timeout() clamps second RTO to 1 jiffy due to bogus tp->retrans_stamp. 
- on clamped RTO with non-zero icsk_retransmits, retransmits_timed_out() sets start_ts from tp->retrans_stamp from TFO payload retransmit hours/days ago, and computes bogus long elapsed time for loss recovery, and suffers ETIMEDOUT early. Fixes: a7abf3cd76e1 ("tcp: consider using standard rtx logic in tcp_rcv_fastopen_synack()") CC: stable@vger.kernel.org Co-developed-by: Neal Cardwell Signed-off-by: Neal Cardwell Co-developed-by: Yuchung Cheng Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240614130615.396837-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c04a9c8be9df..01d208e0eef31 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6296,6 +6296,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, skb_rbtree_walk_from(data) tcp_mark_skb_lost(sk, data); tcp_xmit_retransmit_queue(sk); + tp->retrans_stamp = 0; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; From e874557fce1b6023efafd523aee0c347bf7f1694 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 14 Jun 2024 19:15:29 +0200 Subject: [PATCH 143/272] selftests: mptcp: userspace_pm: fixed subtest names It is important to have fixed (sub)test names in TAP, because these names are used to identify them. If they are not fixed, tracking cannot be done. Some subtests from the userspace_pm selftest were using random numbers in their names: the client and server address IDs from $RANDOM, and the client port number randomly picked by the kernel when creating the connection. These values have been replaced by 'client' and 'server' words: that's even more helpful than showing random numbers. Note that the addresses IDs are incremented and decremented in the test: +1 or -1 are then displayed in these cases. 
Not to lose info that can be useful for debugging in case of issues, these random numbers are now displayed at the beginning of the test. Fixes: f589234e1af0 ("selftests: mptcp: userspace_pm: format subtests results in TAP") Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240614-upstream-net-20240614-selftests-mptcp-uspace-pm-fixed-test-names-v1-1-460ad3edb429@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/net/mptcp/userspace_pm.sh | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 9e2981f2d7f5c..9cb05978269d1 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -160,10 +160,12 @@ make_connection() local is_v6=$1 local app_port=$app4_port local connect_addr="10.0.1.1" + local client_addr="10.0.1.2" local listen_addr="0.0.0.0" if [ "$is_v6" = "v6" ] then connect_addr="dead:beef:1::1" + client_addr="dead:beef:1::2" listen_addr="::" app_port=$app6_port else @@ -206,6 +208,7 @@ make_connection() [ "$server_serverside" = 1 ] then test_pass + print_title "Connection info: ${client_addr}:${client_port} -> ${connect_addr}:${app_port}" else test_fail "Expected tokens (c:${client_token} - s:${server_token}) and server (c:${client_serverside} - s:${server_serverside})" mptcp_lib_result_print_all_tap @@ -297,7 +300,7 @@ test_announce() ip netns exec "$ns2"\ ./pm_nl_ctl ann 10.0.2.2 token "$client4_token" id $client_addr_id dev\ ns2eth1 - print_test "ADD_ADDR id:${client_addr_id} 10.0.2.2 (ns2) => ns1, reuse port" + print_test "ADD_ADDR id:client 10.0.2.2 (ns2) => ns1, reuse port" sleep 0.5 verify_announce_event $server_evts $ANNOUNCED $server4_token "10.0.2.2" $client_addr_id \ "$client4_port" @@ -306,7 +309,7 @@ test_announce() :>"$server_evts" ip netns exec "$ns2" ./pm_nl_ctl 
ann\ dead:beef:2::2 token "$client6_token" id $client_addr_id dev ns2eth1 - print_test "ADD_ADDR6 id:${client_addr_id} dead:beef:2::2 (ns2) => ns1, reuse port" + print_test "ADD_ADDR6 id:client dead:beef:2::2 (ns2) => ns1, reuse port" sleep 0.5 verify_announce_event "$server_evts" "$ANNOUNCED" "$server6_token" "dead:beef:2::2"\ "$client_addr_id" "$client6_port" "v6" @@ -316,7 +319,7 @@ test_announce() client_addr_id=$((client_addr_id+1)) ip netns exec "$ns2" ./pm_nl_ctl ann 10.0.2.2 token "$client4_token" id\ $client_addr_id dev ns2eth1 port $new4_port - print_test "ADD_ADDR id:${client_addr_id} 10.0.2.2 (ns2) => ns1, new port" + print_test "ADD_ADDR id:client+1 10.0.2.2 (ns2) => ns1, new port" sleep 0.5 verify_announce_event "$server_evts" "$ANNOUNCED" "$server4_token" "10.0.2.2"\ "$client_addr_id" "$new4_port" @@ -327,7 +330,7 @@ test_announce() # ADD_ADDR from the server to client machine reusing the subflow port ip netns exec "$ns1" ./pm_nl_ctl ann 10.0.2.1 token "$server4_token" id\ $server_addr_id dev ns1eth2 - print_test "ADD_ADDR id:${server_addr_id} 10.0.2.1 (ns1) => ns2, reuse port" + print_test "ADD_ADDR id:server 10.0.2.1 (ns1) => ns2, reuse port" sleep 0.5 verify_announce_event "$client_evts" "$ANNOUNCED" "$client4_token" "10.0.2.1"\ "$server_addr_id" "$app4_port" @@ -336,7 +339,7 @@ test_announce() :>"$client_evts" ip netns exec "$ns1" ./pm_nl_ctl ann dead:beef:2::1 token "$server6_token" id\ $server_addr_id dev ns1eth2 - print_test "ADD_ADDR6 id:${server_addr_id} dead:beef:2::1 (ns1) => ns2, reuse port" + print_test "ADD_ADDR6 id:server dead:beef:2::1 (ns1) => ns2, reuse port" sleep 0.5 verify_announce_event "$client_evts" "$ANNOUNCED" "$client6_token" "dead:beef:2::1"\ "$server_addr_id" "$app6_port" "v6" @@ -346,7 +349,7 @@ test_announce() server_addr_id=$((server_addr_id+1)) ip netns exec "$ns1" ./pm_nl_ctl ann 10.0.2.1 token "$server4_token" id\ $server_addr_id dev ns1eth2 port $new4_port - print_test "ADD_ADDR id:${server_addr_id} 10.0.2.1 (ns1) 
=> ns2, new port" + print_test "ADD_ADDR id:server+1 10.0.2.1 (ns1) => ns2, new port" sleep 0.5 verify_announce_event "$client_evts" "$ANNOUNCED" "$client4_token" "10.0.2.1"\ "$server_addr_id" "$new4_port" @@ -380,7 +383,7 @@ test_remove() local invalid_token=$(( client4_token - 1 )) ip netns exec "$ns2" ./pm_nl_ctl rem token $invalid_token id\ $client_addr_id > /dev/null 2>&1 - print_test "RM_ADDR id:${client_addr_id} ns2 => ns1, invalid token" + print_test "RM_ADDR id:client ns2 => ns1, invalid token" local type type=$(mptcp_lib_evts_get_info type "$server_evts") if [ "$type" = "" ] @@ -394,7 +397,7 @@ test_remove() local invalid_id=$(( client_addr_id + 1 )) ip netns exec "$ns2" ./pm_nl_ctl rem token "$client4_token" id\ $invalid_id > /dev/null 2>&1 - print_test "RM_ADDR id:${invalid_id} ns2 => ns1, invalid id" + print_test "RM_ADDR id:client+1 ns2 => ns1, invalid id" type=$(mptcp_lib_evts_get_info type "$server_evts") if [ "$type" = "" ] then @@ -407,7 +410,7 @@ test_remove() :>"$server_evts" ip netns exec "$ns2" ./pm_nl_ctl rem token "$client4_token" id\ $client_addr_id - print_test "RM_ADDR id:${client_addr_id} ns2 => ns1" + print_test "RM_ADDR id:client ns2 => ns1" sleep 0.5 verify_remove_event "$server_evts" "$REMOVED" "$server4_token" "$client_addr_id" @@ -416,7 +419,7 @@ test_remove() client_addr_id=$(( client_addr_id - 1 )) ip netns exec "$ns2" ./pm_nl_ctl rem token "$client4_token" id\ $client_addr_id - print_test "RM_ADDR id:${client_addr_id} ns2 => ns1" + print_test "RM_ADDR id:client-1 ns2 => ns1" sleep 0.5 verify_remove_event "$server_evts" "$REMOVED" "$server4_token" "$client_addr_id" @@ -424,7 +427,7 @@ test_remove() :>"$server_evts" ip netns exec "$ns2" ./pm_nl_ctl rem token "$client6_token" id\ $client_addr_id - print_test "RM_ADDR6 id:${client_addr_id} ns2 => ns1" + print_test "RM_ADDR6 id:client-1 ns2 => ns1" sleep 0.5 verify_remove_event "$server_evts" "$REMOVED" "$server6_token" "$client_addr_id" @@ -434,7 +437,7 @@ test_remove() # RM_ADDR 
from the server to client machine ip netns exec "$ns1" ./pm_nl_ctl rem token "$server4_token" id\ $server_addr_id - print_test "RM_ADDR id:${server_addr_id} ns1 => ns2" + print_test "RM_ADDR id:server ns1 => ns2" sleep 0.5 verify_remove_event "$client_evts" "$REMOVED" "$client4_token" "$server_addr_id" @@ -443,7 +446,7 @@ test_remove() server_addr_id=$(( server_addr_id - 1 )) ip netns exec "$ns1" ./pm_nl_ctl rem token "$server4_token" id\ $server_addr_id - print_test "RM_ADDR id:${server_addr_id} ns1 => ns2" + print_test "RM_ADDR id:server-1 ns1 => ns2" sleep 0.5 verify_remove_event "$client_evts" "$REMOVED" "$client4_token" "$server_addr_id" @@ -451,7 +454,7 @@ test_remove() :>"$client_evts" ip netns exec "$ns1" ./pm_nl_ctl rem token "$server6_token" id\ $server_addr_id - print_test "RM_ADDR6 id:${server_addr_id} ns1 => ns2" + print_test "RM_ADDR6 id:server-1 ns1 => ns2" sleep 0.5 verify_remove_event "$client_evts" "$REMOVED" "$client6_token" "$server_addr_id" } @@ -479,8 +482,14 @@ verify_subflow_events() local locid local remid local info + local e_dport_txt - info="${e_saddr} (${e_from}) => ${e_daddr}:${e_dport} (${e_to})" + # only display the fixed ports + if [ "${e_dport}" -ge "${app4_port}" ] && [ "${e_dport}" -le "${app6_port}" ]; then + e_dport_txt=":${e_dport}" + fi + + info="${e_saddr} (${e_from}) => ${e_daddr}${e_dport_txt} (${e_to})" if [ "$e_type" = "$SUB_ESTABLISHED" ] then @@ -766,7 +775,7 @@ test_subflows_v4_v6_mix() :>"$client_evts" ip netns exec "$ns1" ./pm_nl_ctl ann 10.0.2.1 token "$server6_token" id\ $server_addr_id dev ns1eth2 - print_test "ADD_ADDR4 id:${server_addr_id} 10.0.2.1 (ns1) => ns2, reuse port" + print_test "ADD_ADDR4 id:server 10.0.2.1 (ns1) => ns2, reuse port" sleep 0.5 verify_announce_event "$client_evts" "$ANNOUNCED" "$client6_token" "10.0.2.1"\ "$server_addr_id" "$app6_port" @@ -861,7 +870,7 @@ test_listener() local listener_pid=$! 
sleep 0.5 - print_test "CREATE_LISTENER 10.0.2.2:$client4_port" + print_test "CREATE_LISTENER 10.0.2.2 (client port)" verify_listener_events $client_evts $LISTENER_CREATED $AF_INET 10.0.2.2 $client4_port # ADD_ADDR from client to server machine reusing the subflow port @@ -878,13 +887,14 @@ test_listener() mptcp_lib_kill_wait $listener_pid sleep 0.5 - print_test "CLOSE_LISTENER 10.0.2.2:$client4_port" + print_test "CLOSE_LISTENER 10.0.2.2 (client port)" verify_listener_events $client_evts $LISTENER_CLOSED $AF_INET 10.0.2.2 $client4_port } print_title "Make connections" make_connection make_connection "v6" +print_title "Will be using address IDs ${client_addr_id} (client) and ${server_addr_id} (server)" test_announce test_remove From 2eab4543a2204092c3a7af81d7d6c506e59a03a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Jun 2024 08:20:02 +0000 Subject: [PATCH 144/272] ipv6: prevent possible NULL deref in fib6_nh_init() syzbot reminds us that in6_dev_get() can return NULL. fib6_nh_init() ip6_validate_gw( &idev ) ip6_route_check_nh( idev ) *idev = in6_dev_get(dev); // can be NULL Oops: general protection fault, probably for non-canonical address 0xdffffc00000000bc: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x00000000000005e0-0x00000000000005e7] CPU: 0 PID: 11237 Comm: syz-executor.3 Not tainted 6.10.0-rc2-syzkaller-00249-gbe27b8965297 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/07/2024 RIP: 0010:fib6_nh_init+0x640/0x2160 net/ipv6/route.c:3606 Code: 00 00 fc ff df 4c 8b 64 24 58 48 8b 44 24 28 4c 8b 74 24 30 48 89 c1 48 89 44 24 28 48 8d 98 e0 05 00 00 48 89 d8 48 c1 e8 03 <42> 0f b6 04 38 84 c0 0f 85 b3 17 00 00 8b 1b 31 ff 89 de e8 b8 8b RSP: 0018:ffffc900032775a0 EFLAGS: 00010202 RAX: 00000000000000bc RBX: 00000000000005e0 RCX: 0000000000000000 RDX: 0000000000000010 RSI: ffffc90003277a54 RDI: ffff88802b3a08d8 RBP: ffffc900032778b0 R08: 00000000000002fc R09: 0000000000000000 R10: 
00000000000002fc R11: 0000000000000000 R12: ffff88802b3a08b8 R13: 1ffff9200064eec8 R14: ffffc90003277a00 R15: dffffc0000000000 FS: 00007f940feb06c0(0000) GS:ffff8880b9400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000245e8000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ip6_route_info_create+0x99e/0x12b0 net/ipv6/route.c:3809 ip6_route_add+0x28/0x160 net/ipv6/route.c:3853 ipv6_route_ioctl+0x588/0x870 net/ipv6/route.c:4483 inet6_ioctl+0x21a/0x280 net/ipv6/af_inet6.c:579 sock_do_ioctl+0x158/0x460 net/socket.c:1222 sock_ioctl+0x629/0x8e0 net/socket.c:1341 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f940f07cea9 Fixes: 428604fb118f ("ipv6: do not set routes if disable_ipv6 has been enabled") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: Lorenzo Bianconi Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240614082002.26407-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 952c2bf117094..28788ffde5854 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3603,7 +3603,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, if (!dev) goto out; - if (idev->cnf.disable_ipv6) { + if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); err = -EACCES; goto out; From b86762dbe19a62e785c189f313cda5b989931f37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Jun 2024 15:14:54 +0000 Subject: [PATCH 145/272] ipv6: prevent possible NULL dereference in rt6_probe() 
syzbot caught a NULL dereference in rt6_probe() [1] Bail out if __in6_dev_get() returns NULL. [1] Oops: general protection fault, probably for non-canonical address 0xdffffc00000000cb: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000658-0x000000000000065f] CPU: 1 PID: 22444 Comm: syz-executor.0 Not tainted 6.10.0-rc2-syzkaller-00383-gb8481381d4e2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 RIP: 0010:rt6_probe net/ipv6/route.c:656 [inline] RIP: 0010:find_match+0x8c4/0xf50 net/ipv6/route.c:758 Code: 14 fd f7 48 8b 85 38 ff ff ff 48 c7 45 b0 00 00 00 00 48 8d b8 5c 06 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 19 RSP: 0018:ffffc900034af070 EFLAGS: 00010203 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffc90004521000 RDX: 00000000000000cb RSI: ffffffff8990d0cd RDI: 000000000000065c RBP: ffffc900034af150 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000002 R12: 000000000000000a R13: 1ffff92000695e18 R14: ffff8880244a1d20 R15: 0000000000000000 FS: 00007f4844a5a6c0(0000) GS:ffff8880b9300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b31b27000 CR3: 000000002d42c000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rt6_nh_find_match+0xfa/0x1a0 net/ipv6/route.c:784 nexthop_for_each_fib6_nh+0x26d/0x4a0 net/ipv4/nexthop.c:1496 __find_rr_leaf+0x6e7/0xe00 net/ipv6/route.c:825 find_rr_leaf net/ipv6/route.c:853 [inline] rt6_select net/ipv6/route.c:897 [inline] fib6_table_lookup+0x57e/0xa30 net/ipv6/route.c:2195 ip6_pol_route+0x1cd/0x1150 net/ipv6/route.c:2231 pol_lookup_func include/net/ip6_fib.h:616 [inline] fib6_rule_lookup+0x386/0x720 net/ipv6/fib6_rules.c:121 ip6_route_output_flags_noref net/ipv6/route.c:2639 [inline] 
ip6_route_output_flags+0x1d0/0x640 net/ipv6/route.c:2651 ip6_dst_lookup_tail.constprop.0+0x961/0x1760 net/ipv6/ip6_output.c:1147 ip6_dst_lookup_flow+0x99/0x1d0 net/ipv6/ip6_output.c:1250 rawv6_sendmsg+0xdab/0x4340 net/ipv6/raw.c:898 inet_sendmsg+0x119/0x140 net/ipv4/af_inet.c:853 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] sock_write_iter+0x4b8/0x5c0 net/socket.c:1160 new_sync_write fs/read_write.c:497 [inline] vfs_write+0x6b6/0x1140 fs/read_write.c:590 ksys_write+0x1f8/0x260 fs/read_write.c:643 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 52e1635631b3 ("[IPV6]: ROUTE: Add router_probe_interval sysctl.") Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240615151454.166404-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 28788ffde5854..8d72ca0b086d7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -638,6 +638,8 @@ static void rt6_probe(struct fib6_nh *fib6_nh) rcu_read_lock(); last_probe = READ_ONCE(fib6_nh->last_probe); idev = __in6_dev_get(dev); + if (!idev) + goto out; neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (READ_ONCE(neigh->nud_state) & NUD_VALID) From d46401052c2d5614da8efea5788532f0401cb164 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Jun 2024 15:42:31 +0000 Subject: [PATCH 146/272] xfrm6: check ip6_dst_idev() return value in xfrm6_get_saddr() ip6_dst_idev() can return NULL, xfrm6_get_saddr() must act accordingly. 
syzbot reported: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 PID: 12 Comm: kworker/u8:1 Not tainted 6.10.0-rc2-syzkaller-00383-gb8481381d4e2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 Workqueue: wg-kex-wg1 wg_packet_handshake_send_worker RIP: 0010:xfrm6_get_saddr+0x93/0x130 net/ipv6/xfrm6_policy.c:64 Code: df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 97 00 00 00 4c 8b ab d8 00 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 ea 48 c1 ea 03 <80> 3c 02 00 0f 85 86 00 00 00 4d 8b 6d 00 e8 ca 13 47 01 48 b8 00 RSP: 0018:ffffc90000117378 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff88807b079dc0 RCX: ffffffff89a0d6d7 RDX: 0000000000000000 RSI: ffffffff89a0d6e9 RDI: ffff88807b079e98 RBP: ffff88807ad73248 R08: 0000000000000007 R09: fffffffffffff000 R10: ffff88807b079dc0 R11: 0000000000000007 R12: ffffc90000117480 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8880b9300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4586d00440 CR3: 0000000079042000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: xfrm_get_saddr net/xfrm/xfrm_policy.c:2452 [inline] xfrm_tmpl_resolve_one net/xfrm/xfrm_policy.c:2481 [inline] xfrm_tmpl_resolve+0xa26/0xf10 net/xfrm/xfrm_policy.c:2541 xfrm_resolve_and_create_bundle+0x140/0x2570 net/xfrm/xfrm_policy.c:2835 xfrm_bundle_lookup net/xfrm/xfrm_policy.c:3070 [inline] xfrm_lookup_with_ifid+0x4d1/0x1e60 net/xfrm/xfrm_policy.c:3201 xfrm_lookup net/xfrm/xfrm_policy.c:3298 [inline] xfrm_lookup_route+0x3b/0x200 net/xfrm/xfrm_policy.c:3309 ip6_dst_lookup_flow+0x15c/0x1d0 net/ipv6/ip6_output.c:1256 send6+0x611/0xd20 drivers/net/wireguard/socket.c:139 
wg_socket_send_skb_to_peer+0xf9/0x220 drivers/net/wireguard/socket.c:178 wg_socket_send_buffer_to_peer+0x12b/0x190 drivers/net/wireguard/socket.c:200 wg_packet_send_handshake_initiation+0x227/0x360 drivers/net/wireguard/send.c:40 wg_packet_handshake_send_worker+0x1c/0x30 drivers/net/wireguard/send.c:51 process_one_work+0x9fb/0x1b60 kernel/workqueue.c:3231 process_scheduled_works kernel/workqueue.c:3312 [inline] worker_thread+0x6c8/0xf70 kernel/workqueue.c:3393 kthread+0x2c1/0x3a0 kernel/kthread.c:389 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240615154231.234442-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/xfrm6_policy.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index cc885d3aa9e59..2f1ea5f999a25 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -56,12 +56,18 @@ static int xfrm6_get_saddr(struct net *net, int oif, { struct dst_entry *dst; struct net_device *dev; + struct inet6_dev *idev; dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; - dev = ip6_dst_idev(dst)->dev; + idev = ip6_dst_idev(dst); + if (!idev) { + dst_release(dst); + return -EHOSTUNREACH; + } + dev = idev->dev; ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6); dst_release(dst); return 0; From ff960f9d3edbe08a736b5a224d91a305ccc946b0 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Fri, 14 Jun 2024 21:13:02 +0800 Subject: [PATCH 147/272] netns: Make get_net_ns() handle zero refcount net Syzkaller hit a warning: refcount_t: addition on 0; use-after-free. 
WARNING: CPU: 3 PID: 7890 at lib/refcount.c:25 refcount_warn_saturate+0xdf/0x1d0 Modules linked in: CPU: 3 PID: 7890 Comm: tun Not tainted 6.10.0-rc3-00100-gcaa4f9578aba-dirty #310 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:refcount_warn_saturate+0xdf/0x1d0 Code: 41 49 04 31 ff 89 de e8 9f 1e cd fe 84 db 75 9c e8 76 26 cd fe c6 05 b6 41 49 04 01 90 48 c7 c7 b8 8e 25 86 e8 d2 05 b5 fe 90 <0f> 0b 90 90 e9 79 ff ff ff e8 53 26 cd fe 0f b6 1 RSP: 0018:ffff8881067b7da0 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff811c72ac RDX: ffff8881026a2140 RSI: ffffffff811c72b5 RDI: 0000000000000001 RBP: ffff8881067b7db0 R08: 0000000000000000 R09: 205b5d3730353139 R10: 0000000000000000 R11: 205d303938375420 R12: ffff8881086500c4 R13: ffff8881086500c4 R14: ffff8881086500b0 R15: ffff888108650040 FS: 00007f5b2961a4c0(0000) GS:ffff88823bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055d7ed36fd18 CR3: 00000001482f6000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? show_regs+0xa3/0xc0 ? __warn+0xa5/0x1c0 ? refcount_warn_saturate+0xdf/0x1d0 ? report_bug+0x1fc/0x2d0 ? refcount_warn_saturate+0xdf/0x1d0 ? handle_bug+0xa1/0x110 ? exc_invalid_op+0x3c/0xb0 ? asm_exc_invalid_op+0x1f/0x30 ? __warn_printk+0xcc/0x140 ? __warn_printk+0xd5/0x140 ? refcount_warn_saturate+0xdf/0x1d0 get_net_ns+0xa4/0xc0 ? __pfx_get_net_ns+0x10/0x10 open_related_ns+0x5a/0x130 __tun_chr_ioctl+0x1616/0x2370 ? __sanitizer_cov_trace_switch+0x58/0xa0 ? __sanitizer_cov_trace_const_cmp2+0x1c/0x30 ? 
__pfx_tun_chr_ioctl+0x10/0x10 tun_chr_ioctl+0x2f/0x40 __x64_sys_ioctl+0x11b/0x160 x64_sys_call+0x1211/0x20d0 do_syscall_64+0x9e/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5b28f165d7 Code: b3 66 90 48 8b 05 b1 48 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 81 48 2d 00 8 RSP: 002b:00007ffc2b59c5e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f5b28f165d7 RDX: 0000000000000000 RSI: 00000000000054e3 RDI: 0000000000000003 RBP: 00007ffc2b59c650 R08: 00007f5b291ed8c0 R09: 00007f5b2961a4c0 R10: 0000000029690010 R11: 0000000000000246 R12: 0000000000400730 R13: 00007ffc2b59cf40 R14: 0000000000000000 R15: 0000000000000000 Kernel panic - not syncing: kernel: panic_on_warn set ... This is triggered as below: ns0 ns1 tun_set_iff() //dev is tun0 tun->dev = dev //ip link set tun0 netns ns1 put_net() //ref is 0 __tun_chr_ioctl() //TUNGETDEVNETNS net = dev_net(tun->dev); open_related_ns(&net->ns, get_net_ns); //ns1 get_net_ns() get_net() //addition on 0 Use maybe_get_net() in get_net_ns in case net's ref is zero to fix this. Fixes: 0c3e0e3bb623 ("tun: Add ioctl() TUNGETDEVNETNS cmd to allow obtaining real net ns of tun device") Signed-off-by: Yue Haibing Link: https://lore.kernel.org/r/20240614131302.2698509-1-yuehaibing@huawei.com Signed-off-by: Paolo Abeni --- net/core/net_namespace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 4f7a61688d189..6a823ba906c65 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -693,11 +693,16 @@ EXPORT_SYMBOL_GPL(__put_net); * get_net_ns - increment the refcount of the network namespace * @ns: common namespace (net) * - * Returns the net's common namespace. + * Returns the net's common namespace or ERR_PTR() if ref is zero.
*/ struct ns_common *get_net_ns(struct ns_common *ns) { - return &get_net(container_of(ns, struct net, ns))->ns; + struct net *net; + + net = maybe_get_net(container_of(ns, struct net, ns)); + if (net) + return &net->ns; + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns); From 2d7198278ece01818cd95a3beffbdf8b2a353fa0 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 14 Jun 2024 16:50:30 +0200 Subject: [PATCH 148/272] qca_spi: Make interrupt remembering atomic The whole mechanism to remember occurred SPI interrupts is not atomic, which could lead to unexpected behavior. So fix this by using atomic bit operations instead. Fixes: 291ab06ecf67 ("net: qualcomm: new Ethernet over SPI driver for QCA7000") Signed-off-by: Stefan Wahren Link: https://lore.kernel.org/r/20240614145030.7781-1-wahrenst@gmx.net Signed-off-by: Paolo Abeni --- drivers/net/ethernet/qualcomm/qca_debug.c | 6 ++---- drivers/net/ethernet/qualcomm/qca_spi.c | 16 ++++++++-------- drivers/net/ethernet/qualcomm/qca_spi.h | 3 +-- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/qca_debug.c b/drivers/net/ethernet/qualcomm/qca_debug.c index ff3b89e9028e9..ad06da0fdaa04 100644 --- a/drivers/net/ethernet/qualcomm/qca_debug.c +++ b/drivers/net/ethernet/qualcomm/qca_debug.c @@ -98,10 +98,8 @@ qcaspi_info_show(struct seq_file *s, void *what) seq_printf(s, "IRQ : %d\n", qca->spi_dev->irq); - seq_printf(s, "INTR REQ : %u\n", - qca->intr_req); - seq_printf(s, "INTR SVC : %u\n", - qca->intr_svc); + seq_printf(s, "INTR : %lx\n", + qca->intr); seq_printf(s, "SPI max speed : %lu\n", (unsigned long)qca->spi_dev->max_speed_hz); diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index 5799ecc88a875..8f7ce6b51a1c9 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -35,6 +35,8 @@ #define MAX_DMA_BURST_LEN 5000 +#define SPI_INTR 0 + /* Modules parameters */ #define 
QCASPI_CLK_SPEED_MIN 1000000 #define QCASPI_CLK_SPEED_MAX 16000000 @@ -579,14 +581,14 @@ qcaspi_spi_thread(void *data) continue; } - if ((qca->intr_req == qca->intr_svc) && + if (!test_bit(SPI_INTR, &qca->intr) && !qca->txr.skb[qca->txr.head]) schedule(); set_current_state(TASK_RUNNING); - netdev_dbg(qca->net_dev, "have work to do. int: %d, tx_skb: %p\n", - qca->intr_req - qca->intr_svc, + netdev_dbg(qca->net_dev, "have work to do. int: %lu, tx_skb: %p\n", + qca->intr, qca->txr.skb[qca->txr.head]); qcaspi_qca7k_sync(qca, QCASPI_EVENT_UPDATE); @@ -600,8 +602,7 @@ qcaspi_spi_thread(void *data) msleep(QCASPI_QCA7K_REBOOT_TIME_MS); } - if (qca->intr_svc != qca->intr_req) { - qca->intr_svc = qca->intr_req; + if (test_and_clear_bit(SPI_INTR, &qca->intr)) { start_spi_intr_handling(qca, &intr_cause); if (intr_cause & SPI_INT_CPU_ON) { @@ -663,7 +664,7 @@ qcaspi_intr_handler(int irq, void *data) { struct qcaspi *qca = data; - qca->intr_req++; + set_bit(SPI_INTR, &qca->intr); if (qca->spi_thread) wake_up_process(qca->spi_thread); @@ -679,8 +680,7 @@ qcaspi_netdev_open(struct net_device *dev) if (!qca) return -EINVAL; - qca->intr_req = 1; - qca->intr_svc = 0; + set_bit(SPI_INTR, &qca->intr); qca->sync = QCASPI_SYNC_UNKNOWN; qcafrm_fsm_init_spi(&qca->frm_handle); diff --git a/drivers/net/ethernet/qualcomm/qca_spi.h b/drivers/net/ethernet/qualcomm/qca_spi.h index d59cb2352ceec..8f4808695e820 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.h +++ b/drivers/net/ethernet/qualcomm/qca_spi.h @@ -81,8 +81,7 @@ struct qcaspi { struct qcafrm_handle frm_handle; struct sk_buff *rx_skb; - unsigned int intr_req; - unsigned int intr_svc; + unsigned long intr; u16 reset_count; #ifdef CONFIG_DEBUG_FS From 8039156e23bc07a0039168266dbe68b30cddf7b2 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 17 Jun 2024 18:37:09 -0700 Subject: [PATCH 149/272] sound/oss/dmasound: add missing MODULE_DESCRIPTION() macro With ARCH=m68k, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: 
missing MODULE_DESCRIPTION() in sound/oss/dmasound/dmasound_core.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Reviewed-by: Geert Uytterhoeven Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/20240617-md-m68k-sound-oss-dmasound-v1-1-5c19306be930@quicinc.com --- sound/oss/dmasound/dmasound_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/oss/dmasound/dmasound_core.c b/sound/oss/dmasound/dmasound_core.c index 164335d3c2009..4b1baf4dd50eb 100644 --- a/sound/oss/dmasound/dmasound_core.c +++ b/sound/oss/dmasound/dmasound_core.c @@ -204,6 +204,7 @@ module_param(numWriteBufs, int, 0); static unsigned int writeBufSize = DEFAULT_BUFF_SIZE ; /* in bytes */ module_param(writeBufSize, int, 0); +MODULE_DESCRIPTION("Atari/Amiga/Q40 core DMA sound driver"); MODULE_LICENSE("GPL"); static int sq_unit = -1; From 70794b9563fe011988bcf6a081af9777e63e8d37 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Tue, 18 Jun 2024 14:16:04 +0800 Subject: [PATCH 150/272] ALSA: hda/realtek: Add more codec ID to no shutup pins list When one of these codecs enters the runtime D3 state, do not shut up its Headset MIC pin.
Signed-off-by: Kailang Yang Link: https://lore.kernel.org/r/8d86f61e7d6f4a03b311e4eb4e5caaef@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index fd337fdd30f69..e2dbcf8f5bcfb 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -583,10 +583,14 @@ static void alc_shutup_pins(struct hda_codec *codec) switch (codec->core.vendor_id) { case 0x10ec0236: case 0x10ec0256: + case 0x10ec0257: case 0x19e58326: case 0x10ec0283: + case 0x10ec0285: case 0x10ec0286: + case 0x10ec0287: case 0x10ec0288: + case 0x10ec0295: case 0x10ec0298: alc_headset_mic_no_shutup(codec); break; From 7725363936a88351b71495774c1e0e852ae4cdca Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Fri, 14 Jun 2024 22:41:55 +0530 Subject: [PATCH 151/272] net: lan743x: disable WOL upon resume to restore full data path operation When Wake-on-LAN (WoL) is active and the system is in suspend mode, triggering a system event can wake the system from sleep, which may block the data path. To restore normal data path functionality after waking, disable all wake-up events. Furthermore, clear all Write 1 to Clear (W1C) status bits by writing 1's to them. 
Fixes: 4d94282afd95 ("lan743x: Add power management support") Reviewed-by: Wojciech Drewek Signed-off-by: Raju Lakkaraju Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microchip/lan743x_main.c | 30 ++++++++++++++++--- drivers/net/ethernet/microchip/lan743x_main.h | 24 +++++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 6be8a43c908a8..48835bdc2e63a 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -3575,7 +3575,7 @@ static void lan743x_pm_set_wol(struct lan743x_adapter *adapter) /* clear wake settings */ pmtctl = lan743x_csr_read(adapter, PMT_CTL); - pmtctl |= PMT_CTL_WUPS_MASK_; + pmtctl |= PMT_CTL_WUPS_MASK_ | PMT_CTL_RES_CLR_WKP_MASK_; pmtctl &= ~(PMT_CTL_GPIO_WAKEUP_EN_ | PMT_CTL_EEE_WAKEUP_EN_ | PMT_CTL_WOL_EN_ | PMT_CTL_MAC_D3_RX_CLK_OVR_ | PMT_CTL_RX_FCT_RFE_D3_CLK_OVR_ | PMT_CTL_ETH_PHY_WAKE_EN_); @@ -3710,6 +3710,7 @@ static int lan743x_pm_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct net_device *netdev = pci_get_drvdata(pdev); struct lan743x_adapter *adapter = netdev_priv(netdev); + u32 data; int ret; pci_set_power_state(pdev, PCI_D0); @@ -3728,6 +3729,30 @@ static int lan743x_pm_resume(struct device *dev) return ret; } + ret = lan743x_csr_read(adapter, MAC_WK_SRC); + netif_dbg(adapter, drv, adapter->netdev, + "Wakeup source : 0x%08X\n", ret); + + /* Clear the wol configuration and status bits. 
Note that + * the status bits are "Write One to Clear (W1C)" + */ + data = MAC_WUCSR_EEE_TX_WAKE_ | MAC_WUCSR_EEE_RX_WAKE_ | + MAC_WUCSR_RFE_WAKE_FR_ | MAC_WUCSR_PFDA_FR_ | MAC_WUCSR_WUFR_ | + MAC_WUCSR_MPR_ | MAC_WUCSR_BCAST_FR_; + lan743x_csr_write(adapter, MAC_WUCSR, data); + + data = MAC_WUCSR2_NS_RCD_ | MAC_WUCSR2_ARP_RCD_ | + MAC_WUCSR2_IPV6_TCPSYN_RCD_ | MAC_WUCSR2_IPV4_TCPSYN_RCD_; + lan743x_csr_write(adapter, MAC_WUCSR2, data); + + data = MAC_WK_SRC_ETH_PHY_WK_ | MAC_WK_SRC_IPV6_TCPSYN_RCD_WK_ | + MAC_WK_SRC_IPV4_TCPSYN_RCD_WK_ | MAC_WK_SRC_EEE_TX_WK_ | + MAC_WK_SRC_EEE_RX_WK_ | MAC_WK_SRC_RFE_FR_WK_ | + MAC_WK_SRC_PFDA_FR_WK_ | MAC_WK_SRC_MP_FR_WK_ | + MAC_WK_SRC_BCAST_FR_WK_ | MAC_WK_SRC_WU_FR_WK_ | + MAC_WK_SRC_WK_FR_SAVED_; + lan743x_csr_write(adapter, MAC_WK_SRC, data); + /* open netdev when netdev is at running state while resume. * For instance, it is true when system wakesup after pm-suspend * However, it is false when system wakes up after suspend GUI menu @@ -3736,9 +3761,6 @@ static int lan743x_pm_resume(struct device *dev) lan743x_netdev_open(netdev); netif_device_attach(netdev); - ret = lan743x_csr_read(adapter, MAC_WK_SRC); - netif_info(adapter, drv, adapter->netdev, - "Wakeup source : 0x%08X\n", ret); return 0; } diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h index 645bc048e52ef..fac0f33d10b2e 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.h +++ b/drivers/net/ethernet/microchip/lan743x_main.h @@ -61,6 +61,7 @@ #define PMT_CTL_RX_FCT_RFE_D3_CLK_OVR_ BIT(18) #define PMT_CTL_GPIO_WAKEUP_EN_ BIT(15) #define PMT_CTL_EEE_WAKEUP_EN_ BIT(13) +#define PMT_CTL_RES_CLR_WKP_MASK_ GENMASK(9, 8) #define PMT_CTL_READY_ BIT(7) #define PMT_CTL_ETH_PHY_RST_ BIT(4) #define PMT_CTL_WOL_EN_ BIT(3) @@ -227,12 +228,31 @@ #define MAC_WUCSR (0x140) #define MAC_MP_SO_EN_ BIT(21) #define MAC_WUCSR_RFE_WAKE_EN_ BIT(14) +#define MAC_WUCSR_EEE_TX_WAKE_ BIT(13) +#define MAC_WUCSR_EEE_RX_WAKE_ BIT(11) 
+#define MAC_WUCSR_RFE_WAKE_FR_ BIT(9) +#define MAC_WUCSR_PFDA_FR_ BIT(7) +#define MAC_WUCSR_WUFR_ BIT(6) +#define MAC_WUCSR_MPR_ BIT(5) +#define MAC_WUCSR_BCAST_FR_ BIT(4) #define MAC_WUCSR_PFDA_EN_ BIT(3) #define MAC_WUCSR_WAKE_EN_ BIT(2) #define MAC_WUCSR_MPEN_ BIT(1) #define MAC_WUCSR_BCST_EN_ BIT(0) #define MAC_WK_SRC (0x144) +#define MAC_WK_SRC_ETH_PHY_WK_ BIT(17) +#define MAC_WK_SRC_IPV6_TCPSYN_RCD_WK_ BIT(16) +#define MAC_WK_SRC_IPV4_TCPSYN_RCD_WK_ BIT(15) +#define MAC_WK_SRC_EEE_TX_WK_ BIT(14) +#define MAC_WK_SRC_EEE_RX_WK_ BIT(13) +#define MAC_WK_SRC_RFE_FR_WK_ BIT(12) +#define MAC_WK_SRC_PFDA_FR_WK_ BIT(11) +#define MAC_WK_SRC_MP_FR_WK_ BIT(10) +#define MAC_WK_SRC_BCAST_FR_WK_ BIT(9) +#define MAC_WK_SRC_WU_FR_WK_ BIT(8) +#define MAC_WK_SRC_WK_FR_SAVED_ BIT(7) + #define MAC_MP_SO_HI (0x148) #define MAC_MP_SO_LO (0x14C) @@ -295,6 +315,10 @@ #define RFE_INDX(index) (0x580 + (index << 2)) #define MAC_WUCSR2 (0x600) +#define MAC_WUCSR2_NS_RCD_ BIT(7) +#define MAC_WUCSR2_ARP_RCD_ BIT(6) +#define MAC_WUCSR2_IPV6_TCPSYN_RCD_ BIT(5) +#define MAC_WUCSR2_IPV4_TCPSYN_RCD_ BIT(4) #define SGMII_ACC (0x720) #define SGMII_ACC_SGMII_BZY_ BIT(31) From 8c248cd836014339498486f14f435c0e344183a7 Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Fri, 14 Jun 2024 22:41:56 +0530 Subject: [PATCH 152/272] net: lan743x: Support WOL at both the PHY and MAC appropriately Prevent options not supported by the PHY from being requested to it by the MAC Whenever a WOL option is supported by both, the PHY is given priority since that usually leads to better power savings. 
Fixes: e9e13b6adc33 ("lan743x: fix for potential NULL pointer dereference with bare card") Reviewed-by: Wojciech Drewek Signed-off-by: Raju Lakkaraju Signed-off-by: Paolo Abeni --- .../net/ethernet/microchip/lan743x_ethtool.c | 44 +++++++++++++++++-- drivers/net/ethernet/microchip/lan743x_main.c | 18 ++++++-- drivers/net/ethernet/microchip/lan743x_main.h | 4 ++ 3 files changed, 58 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c b/drivers/net/ethernet/microchip/lan743x_ethtool.c index d0f4ff4ee0759..0d1740d646769 100644 --- a/drivers/net/ethernet/microchip/lan743x_ethtool.c +++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c @@ -1127,8 +1127,12 @@ static void lan743x_ethtool_get_wol(struct net_device *netdev, if (netdev->phydev) phy_ethtool_get_wol(netdev->phydev, wol); - wol->supported |= WAKE_BCAST | WAKE_UCAST | WAKE_MCAST | - WAKE_MAGIC | WAKE_PHY | WAKE_ARP; + if (wol->supported != adapter->phy_wol_supported) + netif_warn(adapter, drv, adapter->netdev, + "PHY changed its supported WOL! 
old=%x, new=%x\n", + adapter->phy_wol_supported, wol->supported); + + wol->supported |= MAC_SUPPORTED_WAKES; if (adapter->is_pci11x1x) wol->supported |= WAKE_MAGICSECURE; @@ -1143,7 +1147,39 @@ static int lan743x_ethtool_set_wol(struct net_device *netdev, { struct lan743x_adapter *adapter = netdev_priv(netdev); + /* WAKE_MAGICSECURE is a modifier of and only valid together with + * WAKE_MAGIC + */ + if ((wol->wolopts & WAKE_MAGICSECURE) && !(wol->wolopts & WAKE_MAGIC)) + return -EINVAL; + + if (netdev->phydev) { + struct ethtool_wolinfo phy_wol; + int ret; + + phy_wol.wolopts = wol->wolopts & adapter->phy_wol_supported; + + /* If WAKE_MAGICSECURE was requested, filter out WAKE_MAGIC + * for PHYs that do not support WAKE_MAGICSECURE + */ + if (wol->wolopts & WAKE_MAGICSECURE && + !(adapter->phy_wol_supported & WAKE_MAGICSECURE)) + phy_wol.wolopts &= ~WAKE_MAGIC; + + ret = phy_ethtool_set_wol(netdev->phydev, &phy_wol); + if (ret && (ret != -EOPNOTSUPP)) + return ret; + + if (ret == -EOPNOTSUPP) + adapter->phy_wolopts = 0; + else + adapter->phy_wolopts = phy_wol.wolopts; + } else { + adapter->phy_wolopts = 0; + } + adapter->wolopts = 0; + wol->wolopts &= ~adapter->phy_wolopts; if (wol->wolopts & WAKE_UCAST) adapter->wolopts |= WAKE_UCAST; if (wol->wolopts & WAKE_MCAST) @@ -1164,10 +1200,10 @@ static int lan743x_ethtool_set_wol(struct net_device *netdev, memset(adapter->sopass, 0, sizeof(u8) * SOPASS_MAX); } + wol->wolopts = adapter->wolopts | adapter->phy_wolopts; device_set_wakeup_enable(&adapter->pdev->dev, (bool)wol->wolopts); - return netdev->phydev ? 
phy_ethtool_set_wol(netdev->phydev, wol) - : -ENETDOWN; + return 0; } #endif /* CONFIG_PM */ diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 48835bdc2e63a..e418539565b18 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -3118,6 +3118,17 @@ static int lan743x_netdev_open(struct net_device *netdev) if (ret) goto close_tx; } + +#ifdef CONFIG_PM + if (adapter->netdev->phydev) { + struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; + + phy_ethtool_get_wol(netdev->phydev, &wol); + adapter->phy_wol_supported = wol.supported; + adapter->phy_wolopts = wol.wolopts; + } +#endif + return 0; close_tx: @@ -3587,10 +3598,9 @@ static void lan743x_pm_set_wol(struct lan743x_adapter *adapter) pmtctl |= PMT_CTL_ETH_PHY_D3_COLD_OVR_ | PMT_CTL_ETH_PHY_D3_OVR_; - if (adapter->wolopts & WAKE_PHY) { - pmtctl |= PMT_CTL_ETH_PHY_EDPD_PLL_CTL_; + if (adapter->phy_wolopts) pmtctl |= PMT_CTL_ETH_PHY_WAKE_EN_; - } + if (adapter->wolopts & WAKE_MAGIC) { wucsr |= MAC_WUCSR_MPEN_; macrx |= MAC_RX_RXEN_; @@ -3686,7 +3696,7 @@ static int lan743x_pm_suspend(struct device *dev) lan743x_csr_write(adapter, MAC_WUCSR2, 0); lan743x_csr_write(adapter, MAC_WK_SRC, 0xFFFFFFFF); - if (adapter->wolopts) + if (adapter->wolopts || adapter->phy_wolopts) lan743x_pm_set_wol(adapter); if (adapter->is_pci11x1x) { diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h index fac0f33d10b2e..3b2585a384e2c 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.h +++ b/drivers/net/ethernet/microchip/lan743x_main.h @@ -1042,6 +1042,8 @@ enum lan743x_sgmii_lsd { LINK_2500_SLAVE }; +#define MAC_SUPPORTED_WAKES (WAKE_BCAST | WAKE_UCAST | WAKE_MCAST | \ + WAKE_MAGIC | WAKE_ARP) struct lan743x_adapter { struct net_device *netdev; struct mii_bus *mdiobus; @@ -1049,6 +1051,8 @@ struct lan743x_adapter { #ifdef CONFIG_PM u32 wolopts; u8 
sopass[SOPASS_MAX]; + u32 phy_wolopts; + u32 phy_wol_supported; #endif struct pci_dev *pdev; struct lan743x_csr csr; From c44d3ffd85db03ebcc3090e55589e10d5af9f3a9 Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Fri, 14 Jun 2024 22:41:57 +0530 Subject: [PATCH 153/272] net: phy: mxl-gpy: Remove interrupt mask clearing from config_init When the system resumes from sleep, the phy_init_hw() function invokes config_init(), which clears all interrupt masks and causes wake events to be lost in subsequent wake sequences. Remove interrupt mask clearing from config_init() and preserve relevant masks in config_intr(). Fixes: 7d901a1e878a ("net: phy: add Maxlinear GPY115/21x/24x driver") Reviewed-by: Wojciech Drewek Signed-off-by: Raju Lakkaraju Signed-off-by: Paolo Abeni --- drivers/net/phy/mxl-gpy.c | 58 +++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c index b2d36a3a96f1e..e5f8ac4b4604b 100644 --- a/drivers/net/phy/mxl-gpy.c +++ b/drivers/net/phy/mxl-gpy.c @@ -107,6 +107,7 @@ struct gpy_priv { u8 fw_major; u8 fw_minor; + u32 wolopts; /* It takes 3 seconds to fully switch out of loopback mode before * it can safely re-enter loopback mode. Record the time when @@ -221,6 +222,15 @@ static int gpy_hwmon_register(struct phy_device *phydev) } #endif +static int gpy_ack_interrupt(struct phy_device *phydev) +{ + int ret; + + /* Clear all pending interrupts */ + ret = phy_read(phydev, PHY_ISTAT); + return ret < 0 ? ret : 0; +} + static int gpy_mbox_read(struct phy_device *phydev, u32 addr) { struct gpy_priv *priv = phydev->priv; @@ -262,16 +272,8 @@ static int gpy_mbox_read(struct phy_device *phydev, u32 addr) static int gpy_config_init(struct phy_device *phydev) { - int ret; - - /* Mask all interrupts */ - ret = phy_write(phydev, PHY_IMASK, 0); - if (ret) - return ret; - - /* Clear all pending interrupts */ - ret = phy_read(phydev, PHY_ISTAT); - return ret < 0 ? 
ret : 0; + /* Nothing to configure. Configuration Requirement Placeholder */ + return 0; } static int gpy21x_config_init(struct phy_device *phydev) @@ -627,11 +629,23 @@ static int gpy_read_status(struct phy_device *phydev) static int gpy_config_intr(struct phy_device *phydev) { + struct gpy_priv *priv = phydev->priv; u16 mask = 0; + int ret; + + ret = gpy_ack_interrupt(phydev); + if (ret) + return ret; if (phydev->interrupts == PHY_INTERRUPT_ENABLED) mask = PHY_IMASK_MASK; + if (priv->wolopts & WAKE_MAGIC) + mask |= PHY_IMASK_WOL; + + if (priv->wolopts & WAKE_PHY) + mask |= PHY_IMASK_LSTC; + return phy_write(phydev, PHY_IMASK, mask); } @@ -678,6 +692,7 @@ static int gpy_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol) { struct net_device *attach_dev = phydev->attached_dev; + struct gpy_priv *priv = phydev->priv; int ret; if (wol->wolopts & WAKE_MAGIC) { @@ -725,6 +740,8 @@ static int gpy_set_wol(struct phy_device *phydev, ret = phy_read(phydev, PHY_ISTAT); if (ret < 0) return ret; + + priv->wolopts |= WAKE_MAGIC; } else { /* Disable magic packet matching */ ret = phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2, @@ -732,6 +749,13 @@ static int gpy_set_wol(struct phy_device *phydev, WOL_EN); if (ret < 0) return ret; + + /* Disable the WOL interrupt */ + ret = phy_clear_bits(phydev, PHY_IMASK, PHY_IMASK_WOL); + if (ret < 0) + return ret; + + priv->wolopts &= ~WAKE_MAGIC; } if (wol->wolopts & WAKE_PHY) { @@ -748,9 +772,11 @@ static int gpy_set_wol(struct phy_device *phydev, if (ret & (PHY_IMASK_MASK & ~PHY_IMASK_LSTC)) phy_trigger_machine(phydev); + priv->wolopts |= WAKE_PHY; return 0; } + priv->wolopts &= ~WAKE_PHY; /* Disable the link state change interrupt */ return phy_clear_bits(phydev, PHY_IMASK, PHY_IMASK_LSTC); } @@ -758,18 +784,10 @@ static int gpy_set_wol(struct phy_device *phydev, static void gpy_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol) { - int ret; + struct gpy_priv *priv = phydev->priv; wol->supported = WAKE_MAGIC | 
WAKE_PHY; - wol->wolopts = 0; - - ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, VPSPEC2_WOL_CTL); - if (ret & WOL_EN) - wol->wolopts |= WAKE_MAGIC; - - ret = phy_read(phydev, PHY_IMASK); - if (ret & PHY_IMASK_LSTC) - wol->wolopts |= WAKE_PHY; + wol->wolopts = priv->wolopts; } static int gpy_loopback(struct phy_device *phydev, bool enable) From d864319871b05fadd153e0aede4811ca7008f5d6 Mon Sep 17 00:00:00 2001 From: David Ruth Date: Fri, 14 Jun 2024 19:03:26 +0000 Subject: [PATCH 154/272] net/sched: act_api: fix possible infinite loop in tcf_idr_check_alloc() syzbot found hanging tasks waiting on rtnl_lock [1] A reproducer is available in the syzbot bug. When a request to add multiple actions with the same index is sent, the second request will block forever on the first request. This holds rtnl_lock, and causes tasks to hang. Return -EAGAIN to prevent infinite looping, while keeping documented behavior. [1] INFO: task kworker/1:0:5088 blocked for more than 143 seconds. Not tainted 6.9.0-rc4-syzkaller-00173-g3cdb45594619 #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
task:kworker/1:0 state:D stack:23744 pid:5088 tgid:5088 ppid:2 flags:0x00004000 Workqueue: events_power_efficient reg_check_chans_work Call Trace: context_switch kernel/sched/core.c:5409 [inline] __schedule+0xf15/0x5d00 kernel/sched/core.c:6746 __schedule_loop kernel/sched/core.c:6823 [inline] schedule+0xe7/0x350 kernel/sched/core.c:6838 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:6895 __mutex_lock_common kernel/locking/mutex.c:684 [inline] __mutex_lock+0x5b8/0x9c0 kernel/locking/mutex.c:752 wiphy_lock include/net/cfg80211.h:5953 [inline] reg_leave_invalid_chans net/wireless/reg.c:2466 [inline] reg_check_chans_work+0x10a/0x10e0 net/wireless/reg.c:2481 Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action") Reported-by: syzbot+b87c222546179f4513a7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b87c222546179f4513a7 Signed-off-by: David Ruth Reviewed-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240614190326.1349786-1-druth@chromium.org Signed-off-by: Paolo Abeni --- net/sched/act_api.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9ee622fb1160f..2520708b06a12 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -830,7 +830,6 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, u32 max; if (*index) { -again: rcu_read_lock(); p = idr_find(&idrinfo->action_idr, *index); @@ -839,7 +838,7 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, * index but did not assign the pointer yet. */ rcu_read_unlock(); - goto again; + return -EAGAIN; } if (!p) { From 2ebe8f840c7450ecbfca9d18ac92e9ce9155e269 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 15 Jun 2024 14:27:20 -0400 Subject: [PATCH 155/272] tipc: force a dst refcount before doing decryption As it says in commit 3bc07321ccc2 ("xfrm: Force a dst refcount before entering the xfrm type handlers"): "Crypto requests might return asynchronous. 
In this case we leave the rcu protected region, so force a refcount on the skb's destination entry before we enter the xfrm type input/output handlers." On TIPC decryption path it has the same problem, and skb_dst_force() should be called before doing decryption to avoid a possible crash. Shuang reported this issue when this warning is triggered: [] WARNING: include/net/dst.h:337 tipc_sk_rcv+0x1055/0x1ea0 [tipc] [] Kdump: loaded Tainted: G W --------- - - 4.18.0-496.el8.x86_64+debug [] Workqueue: crypto cryptd_queue_worker [] RIP: 0010:tipc_sk_rcv+0x1055/0x1ea0 [tipc] [] Call Trace: [] tipc_sk_mcast_rcv+0x548/0xea0 [tipc] [] tipc_rcv+0xcf5/0x1060 [tipc] [] tipc_aead_decrypt_done+0x215/0x2e0 [tipc] [] cryptd_aead_crypt+0xdb/0x190 [] cryptd_queue_worker+0xed/0x190 [] process_one_work+0x93d/0x17e0 Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication") Reported-by: Shuang Li Signed-off-by: Xin Long Link: https://lore.kernel.org/r/fbe3195fad6997a4eec62d9bf076b2ad03ac336b.1718476040.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- net/tipc/node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/node.c b/net/tipc/node.c index c1e890a824347..500320e5ca479 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2105,6 +2105,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) } else { n = tipc_node_find_by_id(net, ehdr->id); } + skb_dst_force(skb); tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); if (!skb) return; From 88c67aeb14070bab61d3dd8be96c8b42ebcaf53a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 15 Jun 2024 17:47:30 -0400 Subject: [PATCH 156/272] sched: act_ct: add netns into the key of tcf_ct_flow_table zones_ht is a global hashtable for flow_table with zone as key. However, it does not consider netns when getting a flow_table from zones_ht in tcf_ct_init(), and it means an act_ct action in netns A may get a flow_table that belongs to netns B if it has the same zone value. 
In Shuang's test with the TOPO: tcf2_c <---> tcf2_sw1 <---> tcf2_sw2 <---> tcf2_s tcf2_sw1 and tcf2_sw2 saw the same flow and used the same flow table, which caused their ct entries entering unexpected states and the TCP connection not able to end normally. This patch fixes the issue simply by adding netns into the key of tcf_ct_flow_table so that an act_ct action gets a flow_table that belongs to its own netns in tcf_ct_init(). Note that for easy coding we don't use tcf_ct_flow_table.nf_ft.net, as the ct_ft is initialized after inserting it to the hashtable in tcf_ct_flow_table_get() and also it requires to implement several functions in rhashtable_params including hashfn, obj_hashfn and obj_cmpfn. Fixes: 64ff70b80fd4 ("net/sched: act_ct: Offload established connections to flow table") Reported-by: Shuang Li Signed-off-by: Xin Long Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/1db5b6cc6902c5fc6f8c6cbd85494a2008087be5.1718488050.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- net/sched/act_ct.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index baac083fd8f10..2a96d9c1db65b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -41,21 +41,26 @@ static struct workqueue_struct *act_ct_wq; static struct rhashtable zones_ht; static DEFINE_MUTEX(zones_mutex); +struct zones_ht_key { + struct net *net; + u16 zone; +}; + struct tcf_ct_flow_table { struct rhash_head node; /* In zones tables */ struct rcu_work rwork; struct nf_flowtable nf_ft; refcount_t ref; - u16 zone; + struct zones_ht_key key; bool dying; }; static const struct rhashtable_params zones_params = { .head_offset = offsetof(struct tcf_ct_flow_table, node), - .key_offset = offsetof(struct tcf_ct_flow_table, zone), - .key_len = sizeof_field(struct tcf_ct_flow_table, zone), + .key_offset = offsetof(struct tcf_ct_flow_table, key), + .key_len = sizeof_field(struct tcf_ct_flow_table, key), .automatic_shrinking = 
true, }; @@ -316,11 +321,12 @@ static struct nf_flowtable_type flowtable_ct = { static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params) { + struct zones_ht_key key = { .net = net, .zone = params->zone }; struct tcf_ct_flow_table *ct_ft; int err = -ENOMEM; mutex_lock(&zones_mutex); - ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params); + ct_ft = rhashtable_lookup_fast(&zones_ht, &key, zones_params); if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) goto out_unlock; @@ -329,7 +335,7 @@ static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params) goto err_alloc; refcount_set(&ct_ft->ref, 1); - ct_ft->zone = params->zone; + ct_ft->key = key; err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); if (err) goto err_insert; From c2bd0791c5f02e964402624dfff45ca8995f5397 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Tue, 18 Jun 2024 15:29:49 +0200 Subject: [PATCH 157/272] spi: stm32: qspi: Fix dual flash mode sanity test in stm32_qspi_setup() A misplaced parenthesis makes the test of mode wrong in case mode is equal to SPI_TX_OCTAL or SPI_RX_OCTAL. Simplify this sanity test: if one of these bits is set, property cs-gpio must be present in DT. 
Fixes: a557fca630cc ("spi: stm32_qspi: Add transfer_one_message() spi callback") Cc: stable@vger.kernel.org Signed-off-by: Patrice Chotard Link: https://msgid.link/r/20240618132951.2743935-2-patrice.chotard@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32-qspi.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-stm32-qspi.c b/drivers/spi/spi-stm32-qspi.c index f1e922fd362ab..6944e85d83679 100644 --- a/drivers/spi/spi-stm32-qspi.c +++ b/drivers/spi/spi-stm32-qspi.c @@ -653,9 +653,7 @@ static int stm32_qspi_setup(struct spi_device *spi) return -EINVAL; mode = spi->mode & (SPI_TX_OCTAL | SPI_RX_OCTAL); - if ((mode == SPI_TX_OCTAL || mode == SPI_RX_OCTAL) || - ((mode == (SPI_TX_OCTAL | SPI_RX_OCTAL)) && - gpiod_count(qspi->dev, "cs") == -ENOENT)) { + if (mode && gpiod_count(qspi->dev, "cs") == -ENOENT) { dev_err(qspi->dev, "spi-rx-bus-width\\/spi-tx-bus-width\\/cs-gpios\n"); dev_err(qspi->dev, "configuration not supported\n"); @@ -676,10 +674,10 @@ static int stm32_qspi_setup(struct spi_device *spi) qspi->cr_reg = CR_APMS | 3 << CR_FTHRES_SHIFT | CR_SSHIFT | CR_EN; /* - * Dual flash mode is only enable in case SPI_TX_OCTAL and SPI_TX_OCTAL - * are both set in spi->mode and "cs-gpios" properties is found in DT + * Dual flash mode is only enable in case SPI_TX_OCTAL or SPI_RX_OCTAL + * is set in spi->mode and "cs-gpios" properties is found in DT */ - if (mode == (SPI_TX_OCTAL | SPI_RX_OCTAL)) { + if (mode) { qspi->cr_reg |= CR_DFM; dev_dbg(qspi->dev, "Dual flash mode enable"); } From 63deee52811b2f84ed2da55ad47252f0e8145d62 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Tue, 18 Jun 2024 15:29:50 +0200 Subject: [PATCH 158/272] spi: stm32: qspi: Clamp stm32_qspi_get_mode() output to CCR_BUSWIDTH_4 In case usage of OCTAL mode, buswidth parameter can take the value 8. 
As return value of stm32_qspi_get_mode() is used to configure fields of CCR registers that are 2 bits only (fields IMODE, ADMODE, ADSIZE, DMODE), clamp return value of stm32_qspi_get_mode() to 4. Fixes: a557fca630cc ("spi: stm32_qspi: Add transfer_one_message() spi callback") Cc: stable@vger.kernel.org Signed-off-by: Patrice Chotard Link: https://msgid.link/r/20240618132951.2743935-3-patrice.chotard@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32-qspi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-stm32-qspi.c b/drivers/spi/spi-stm32-qspi.c index 6944e85d83679..955c920c4b639 100644 --- a/drivers/spi/spi-stm32-qspi.c +++ b/drivers/spi/spi-stm32-qspi.c @@ -349,7 +349,7 @@ static int stm32_qspi_wait_poll_status(struct stm32_qspi *qspi) static int stm32_qspi_get_mode(u8 buswidth) { - if (buswidth == 4) + if (buswidth >= 4) return CCR_BUSWIDTH_4; return buswidth; From d6a711a898672dd873aab3844f754a3ca40723a5 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Tue, 18 Jun 2024 15:29:51 +0200 Subject: [PATCH 159/272] spi: Fix OCTAL mode support Add OCTAL mode support. Issue detected using "--octal" spidev_test's option. 
Signed-off-by: Patrice Chotard Link: https://msgid.link/r/20240618132951.2743935-4-patrice.chotard@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 6 ++++-- include/linux/spi/spi.h | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 9bc9fd10d538d..9da736d51a2ba 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -4156,7 +4156,8 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message) return -EINVAL; if (xfer->tx_nbits != SPI_NBITS_SINGLE && xfer->tx_nbits != SPI_NBITS_DUAL && - xfer->tx_nbits != SPI_NBITS_QUAD) + xfer->tx_nbits != SPI_NBITS_QUAD && + xfer->tx_nbits != SPI_NBITS_OCTAL) return -EINVAL; if ((xfer->tx_nbits == SPI_NBITS_DUAL) && !(spi->mode & (SPI_TX_DUAL | SPI_TX_QUAD))) @@ -4171,7 +4172,8 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message) return -EINVAL; if (xfer->rx_nbits != SPI_NBITS_SINGLE && xfer->rx_nbits != SPI_NBITS_DUAL && - xfer->rx_nbits != SPI_NBITS_QUAD) + xfer->rx_nbits != SPI_NBITS_QUAD && + xfer->rx_nbits != SPI_NBITS_OCTAL) return -EINVAL; if ((xfer->rx_nbits == SPI_NBITS_DUAL) && !(spi->mode & (SPI_RX_DUAL | SPI_RX_QUAD))) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e8e1e798924f4..98fdef6e28f2a 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1085,12 +1085,13 @@ struct spi_transfer { unsigned dummy_data:1; unsigned cs_off:1; unsigned cs_change:1; - unsigned tx_nbits:3; - unsigned rx_nbits:3; + unsigned tx_nbits:4; + unsigned rx_nbits:4; unsigned timestamped:1; #define SPI_NBITS_SINGLE 0x01 /* 1-bit transfer */ #define SPI_NBITS_DUAL 0x02 /* 2-bit transfer */ #define SPI_NBITS_QUAD 0x04 /* 4-bit transfer */ +#define SPI_NBITS_OCTAL 0x08 /* 8-bit transfer */ u8 bits_per_word; struct spi_delay delay; struct spi_delay cs_change_delay; From c3f3edf73a8f854f8766a69d2734198a58762e33 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 12 Jun 2024 09:41:51 
-0500 Subject: [PATCH 160/272] KVM: Stop processing *all* memslots when "null" mmu_notifier handler is found Bail from outer address space loop, not just the inner memslot loop, when a "null" handler is encountered by __kvm_handle_hva_range(), which is the intended behavior. On x86, which has multiple address spaces thanks to SMM emulation, breaking from just the memslot loop results in undefined behavior due to assigning the non-existent return value from kvm_null_fn() to a bool. In practice, the bug is benign as kvm_mmu_notifier_invalidate_range_end() is the only caller that passes handler=kvm_null_fn, and it doesn't set flush_on_ret, i.e. assigning garbage to r.ret is ultimately ignored. And for most configuration the compiler elides the entire sequence, i.e. there is no undefined behavior at runtime. ------------[ cut here ]------------ UBSAN: invalid-load in arch/x86/kvm/../../../virt/kvm/kvm_main.c:655:10 load of value 160 is not a valid value for type '_Bool' CPU: 370 PID: 8246 Comm: CPU 0/KVM Not tainted 6.8.2-amdsos-build58-ubuntu-22.04+ #1 Hardware name: AMD Corporation Sh54p/Sh54p, BIOS WPC4429N 04/25/2024 Call Trace: dump_stack_lvl+0x48/0x60 ubsan_epilogue+0x5/0x30 __ubsan_handle_load_invalid_value+0x79/0x80 kvm_mmu_notifier_invalidate_range_end.cold+0x18/0x4f [kvm] __mmu_notifier_invalidate_range_end+0x63/0xe0 __split_huge_pmd+0x367/0xfc0 do_huge_pmd_wp_page+0x1cc/0x380 __handle_mm_fault+0x8ee/0xe50 handle_mm_fault+0xe4/0x4a0 __get_user_pages+0x190/0x840 get_user_pages_unlocked+0xe0/0x590 hva_to_pfn+0x114/0x550 [kvm] kvm_faultin_pfn+0xed/0x5b0 [kvm] kvm_tdp_page_fault+0x123/0x170 [kvm] kvm_mmu_page_fault+0x244/0xaa0 [kvm] vcpu_enter_guest+0x592/0x1070 [kvm] kvm_arch_vcpu_ioctl_run+0x145/0x8a0 [kvm] kvm_vcpu_ioctl+0x288/0x6d0 [kvm] __x64_sys_ioctl+0x8f/0xd0 do_syscall_64+0x77/0x120 entry_SYSCALL_64_after_hwframe+0x6e/0x76 ---[ end trace ]--- Fixes: 071064f14d87 ("KVM: Don't take mmu_lock for range invalidation unless necessary") Signed-off-by: Babu 
Moger Link: https://lore.kernel.org/r/b8723d39903b64c241c50f5513f804390c7b5eec.1718203311.git.babu.moger@amd.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 843aa68cbcd05..68be4be5d846e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -651,7 +651,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, range->on_lock(kvm); if (IS_KVM_NULL_FN(range->handler)) - break; + goto mmu_unlock; } r.ret |= range->handler(kvm, &gfn_range); } @@ -660,6 +660,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && r.ret) kvm_flush_remote_tlbs(kvm); +mmu_unlock: if (r.found_memslot) KVM_MMU_UNLOCK(kvm); From 5d272dd1b3430bb31fa30042490fa081512424e4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Jun 2024 09:00:04 -0700 Subject: [PATCH 161/272] cpumask: limit FORCE_NR_CPUS to just the UP case Hardcoding the number of CPUs at compile time does improve code generation, but if you get it wrong the result will be confusion. We already limited this earlier to only "experts" (see commit fe5759d5bfda "cpumask: limit visibility of FORCE_NR_CPUS"), but with distro kernel configs often having EXPERT enabled, that turns out to not be much of a limit. To quote the philosophers at Disney: "Everyone can be an expert. And when everyone's an expert, no one will be". There's a runtime warning if you then set nr_cpus to anything but the forced number, but apparently that can be ignored too [1] and by then it's pretty much too late anyway. If we had some real way to limit this to "embedded only", maybe it would be worth it, but let's see if anybody even notices that the option is gone. We need to simplify kernel configuration anyway. 
Link: https://lore.kernel.org/all/20240618105036.208a8860@rorschach.local.home/ [1] Reported-by: Steven Rostedt Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Paul McKenney Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Yury Norov Signed-off-by: Linus Torvalds --- lib/Kconfig | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index d33a268bc256e..b0a76dff5c182 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -539,13 +539,7 @@ config CPUMASK_OFFSTACK stack overflow. config FORCE_NR_CPUS - bool "Set number of CPUs at compile time" - depends on SMP && EXPERT && !COMPILE_TEST - help - Say Yes if you have NR_CPUS set to an actual number of possible - CPUs in your system, not to a default value. This forces the core - code to rely on compile-time value and optimize kernel routines - better. + def_bool !SMP config CPU_RMAP bool From 2c1b7bbe253986619fa5623a13055316e730e746 Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Mon, 17 Jun 2024 21:00:52 +0530 Subject: [PATCH 162/272] spi: Fix SPI slave probe failure While adding a SPI device, the SPI core ensures that multiple logical CS doesn't map to the same physical CS. For example, spi->chip_select[0] != spi->chip_select[1] and so forth. However, unlike the SPI master, the SPI slave doesn't have the list of chip selects, this leads to probe failure when the SPI controller is configured as slave. Update the __spi_add_device() function to perform this check only if the SPI controller is configured as master. 
Fixes: 4d8ff6b0991d ("spi: Add multi-cs memories support in SPI core") Signed-off-by: Amit Kumar Mahapatra Link: https://msgid.link/r/20240617153052.26636-1-amit.kumar-mahapatra@amd.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 9da736d51a2ba..fc13fa1921895 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -689,10 +689,12 @@ static int __spi_add_device(struct spi_device *spi) * Make sure that multiple logical CS doesn't map to the same physical CS. * For example, spi->chip_select[0] != spi->chip_select[1] and so on. */ - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { - status = spi_dev_check_cs(dev, spi, idx, spi, idx + 1); - if (status) - return status; + if (!spi_controller_is_target(ctlr)) { + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + status = spi_dev_check_cs(dev, spi, idx, spi, idx + 1); + if (status) + return status; + } } /* Set the bus ID string */ From 81d23d2a24012e448f651e007fac2cfd20a45ce0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 17 Jun 2024 12:34:32 +0300 Subject: [PATCH 163/272] ptp: fix integer overflow in max_vclocks_store On 32bit systems, the "4 * max" multiply can overflow. Use kcalloc() to do the allocation to prevent this. 
Fixes: 44c494c8e30e ("ptp: track available ptp vclocks information") Signed-off-by: Dan Carpenter Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Heng Qi Link: https://lore.kernel.org/r/ee8110ed-6619-4bd7-9024-28c1f2ac24f4@moroto.mountain Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c index a15460aaa03b3..6b1b8f57cd951 100644 --- a/drivers/ptp/ptp_sysfs.c +++ b/drivers/ptp/ptp_sysfs.c @@ -296,8 +296,7 @@ static ssize_t max_vclocks_store(struct device *dev, if (max < ptp->n_vclocks) goto out; - size = sizeof(int) * max; - vclock_index = kzalloc(size, GFP_KERNEL); + vclock_index = kcalloc(max, sizeof(int), GFP_KERNEL); if (!vclock_index) { err = -ENOMEM; goto out; From e2b447c9a1bba718f9c07513a1e8958209e862a1 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 17 Jun 2024 09:28:33 +0100 Subject: [PATCH 164/272] selftests: openvswitch: Use bash as interpreter openvswitch.sh makes use of substitutions of the form ${ns:0:1}, to obtain the first character of $ns. Empirically, this works with bash but not dash. When run with dash these evaluate to an empty string and print an error to stdout. # dash -c 'ns=client; echo "${ns:0:1}"' 2>error # cat error dash: 1: Bad substitution # bash -c 'ns=client; echo "${ns:0:1}"' 2>error c # cat error This leads to tests that neither pass nor fail. F.e. TEST: arp_ping [START] adding sandbox 'test_arp_ping' Adding DP/Bridge IF: sbx:test_arp_ping dp:arpping {, , } create namespaces ./openvswitch.sh: 282: eval: Bad substitution TEST: ct_connect_v4 [START] adding sandbox 'test_ct_connect_v4' Adding DP/Bridge IF: sbx:test_ct_connect_v4 dp:ct4 {, , } ./openvswitch.sh: 322: eval: Bad substitution create namespaces Resolve this by making openvswitch.sh a bash script. 
Fixes: 918423fda910 ("selftests: openvswitch: add an initial flow programming case") Signed-off-by: Simon Horman Reviewed-by: Przemek Kitszel Link: https://lore.kernel.org/r/20240617-ovs-selftest-bash-v1-1-7ae6ccd3617b@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/openvswitch/openvswitch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index 5cae535438491..15bca07087179 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # # OVS kernel module self tests From 890182bb3d00d49ccd70b0d651ad8342745a08e7 Mon Sep 17 00:00:00 2001 From: Haylen Chu Date: Tue, 18 Jun 2024 09:16:14 +0000 Subject: [PATCH 165/272] riscv: dts: sophgo: disable write-protection for milkv duo Milkv Duo does not have a write-protect pin, so disable write protect to prevent SDcards misdetected as read-only. 
Fixes: 89a7056ed4f7 ("riscv: dts: sophgo: add sdcard support for milkv duo") Signed-off-by: Haylen Chu Link: https://lore.kernel.org/r/SEYPR01MB4221943C7B101DD2318DA0D3D7CE2@SEYPR01MB4221.apcprd01.prod.exchangelabs.com Signed-off-by: Inochi Amaoto Signed-off-by: Chen Wang --- arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts b/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts index cd013588adc0e..375ff2661b6e2 100644 --- a/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts +++ b/arch/riscv/boot/dts/sophgo/cv1800b-milkv-duo.dts @@ -45,6 +45,7 @@ no-1-8-v; no-mmc; no-sdio; + disable-wp; }; &uart0 { From cd6f12e173df44a20c2ac2ac110007dc14968088 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 14 Jun 2024 11:45:15 +0200 Subject: [PATCH 166/272] net: phy: dp83tg720: wake up PHYs in managed mode In case this PHY is bootstrapped for managed mode, we need to manually wake it. Otherwise no link will be detected. 
Cc: stable@vger.kernel.org Fixes: cb80ee2f9bee1 ("net: phy: Add support for the DP83TG720S Ethernet PHY") Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20240614094516.1481231-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83tg720.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/dp83tg720.c b/drivers/net/phy/dp83tg720.c index 326c9770a6dcc..1186dfc70fb3c 100644 --- a/drivers/net/phy/dp83tg720.c +++ b/drivers/net/phy/dp83tg720.c @@ -17,6 +17,11 @@ #define DP83TG720S_PHY_RESET 0x1f #define DP83TG720S_HW_RESET BIT(15) +#define DP83TG720S_LPS_CFG3 0x18c +/* Power modes are documented as bit fields but used as values */ +/* Power Mode 0 is Normal mode */ +#define DP83TG720S_LPS_CFG3_PWR_MODE_0 BIT(0) + #define DP83TG720S_RGMII_DELAY_CTRL 0x602 /* In RGMII mode, Enable or disable the internal delay for RXD */ #define DP83TG720S_RGMII_RX_CLK_SEL BIT(1) @@ -154,10 +159,17 @@ static int dp83tg720_config_init(struct phy_device *phydev) */ usleep_range(1000, 2000); - if (phy_interface_is_rgmii(phydev)) - return dp83tg720_config_rgmii_delay(phydev); + if (phy_interface_is_rgmii(phydev)) { + ret = dp83tg720_config_rgmii_delay(phydev); + if (ret) + return ret; + } - return 0; + /* In case the PHY is bootstrapped in managed mode, we need to + * wake it. + */ + return phy_write_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_LPS_CFG3, + DP83TG720S_LPS_CFG3_PWR_MODE_0); } static struct phy_driver dp83tg720_driver[] = { From 40a64cc9679540ff7c46ecc51178b07d42abbb1c Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 14 Jun 2024 11:45:16 +0200 Subject: [PATCH 167/272] net: phy: dp83tg720: get master/slave configuration in link down state Get master/slave configuration for initial system start with the link in down state. This ensures ethtool shows current configuration. 
Also fixes link reconfiguration with ethtool while in down state, preventing ethtool from displaying outdated configuration. Even though dp83tg720_config_init() is executed periodically as long as the link is in admin up state but no carrier is detected, this is not sufficient for the link in admin down state where dp83tg720_read_status() is not periodically executed. To cover this case, we need an extra read role configuration in dp83tg720_config_aneg(). Fixes: cb80ee2f9bee1 ("net: phy: Add support for the DP83TG720S Ethernet PHY") Cc: stable@vger.kernel.org Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20240614094516.1481231-2-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83tg720.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/dp83tg720.c b/drivers/net/phy/dp83tg720.c index 1186dfc70fb3c..c706429b225a2 100644 --- a/drivers/net/phy/dp83tg720.c +++ b/drivers/net/phy/dp83tg720.c @@ -36,11 +36,20 @@ static int dp83tg720_config_aneg(struct phy_device *phydev) { + int ret; + /* Autoneg is not supported and this PHY supports only one speed. * We need to care only about master/slave configuration if it was * changed by user. */ - return genphy_c45_pma_baset1_setup_master_slave(phydev); + ret = genphy_c45_pma_baset1_setup_master_slave(phydev); + if (ret) + return ret; + + /* Re-read role configuration to make changes visible even if + * the link is in administrative down state. + */ + return genphy_c45_pma_baset1_read_master_slave(phydev); } static int dp83tg720_read_status(struct phy_device *phydev) @@ -69,6 +78,8 @@ static int dp83tg720_read_status(struct phy_device *phydev) return ret; /* After HW reset we need to restore master/slave configuration. + * genphy_c45_pma_baset1_read_master_slave() call will be done + * by the dp83tg720_config_aneg() function. 
*/ ret = dp83tg720_config_aneg(phydev); if (ret) @@ -168,8 +179,15 @@ static int dp83tg720_config_init(struct phy_device *phydev) /* In case the PHY is bootstrapped in managed mode, we need to * wake it. */ - return phy_write_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_LPS_CFG3, - DP83TG720S_LPS_CFG3_PWR_MODE_0); + ret = phy_write_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_LPS_CFG3, + DP83TG720S_LPS_CFG3_PWR_MODE_0); + if (ret) + return ret; + + /* Make role configuration visible for ethtool on init and after + * rest. + */ + return genphy_c45_pma_baset1_read_master_slave(phydev); } static struct phy_driver dp83tg720_driver[] = { From b8c43360f6e424131fa81d3ba8792ad8ff25a09e Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Mon, 17 Jun 2024 09:39:22 +0800 Subject: [PATCH 168/272] net: stmmac: No need to calculate speed divider when offload is disabled commit be27b8965297 ("net: stmmac: replace priv->speed with the portTransmitRate from the tc-cbs parameters") introduced a problem. When deleting, it prompts "Invalid portTransmitRate 0 (idleSlope - sendSlope)" and exits. Add judgment on cbs.enable. Only when offload is enabled, speed divider needs to be calculated. 
Fixes: be27b8965297 ("net: stmmac: replace priv->speed with the portTransmitRate from the tc-cbs parameters") Signed-off-by: Xiaolei Wang Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240617013922.1035854-1-xiaolei.wang@windriver.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/stmmac_tc.c | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 1562fbdd0a040..996f2bcd07a24 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -358,24 +358,28 @@ static int tc_setup_cbs(struct stmmac_priv *priv, port_transmit_rate_kbps = qopt->idleslope - qopt->sendslope; - /* Port Transmit Rate and Speed Divider */ - switch (div_s64(port_transmit_rate_kbps, 1000)) { - case SPEED_10000: - case SPEED_5000: - ptr = 32; - break; - case SPEED_2500: - case SPEED_1000: - ptr = 8; - break; - case SPEED_100: - ptr = 4; - break; - default: - netdev_err(priv->dev, - "Invalid portTransmitRate %lld (idleSlope - sendSlope)\n", - port_transmit_rate_kbps); - return -EINVAL; + if (qopt->enable) { + /* Port Transmit Rate and Speed Divider */ + switch (div_s64(port_transmit_rate_kbps, 1000)) { + case SPEED_10000: + case SPEED_5000: + ptr = 32; + break; + case SPEED_2500: + case SPEED_1000: + ptr = 8; + break; + case SPEED_100: + ptr = 4; + break; + default: + netdev_err(priv->dev, + "Invalid portTransmitRate %lld (idleSlope - sendSlope)\n", + port_transmit_rate_kbps); + return -EINVAL; + } + } else { + ptr = 0; } mode_to_use = priv->plat->tx_queues_cfg[queue].mode_to_use; From 29433a17a79caa8680b9c0761f2b10502fda9ce3 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 18 Jun 2024 19:22:58 +1200 Subject: [PATCH 169/272] cifs: drop the incorrect assertion in cifs_swap_rw() Since commit 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS swap-space"), we can plug 
multiple pages then unplug them all together. That means iov_iter_count(iter) could be way bigger than PAGE_SIZE, it actually equals the size of iov_iter_npages(iter, INT_MAX). Note this issue has nothing to do with large folios as we don't support THP_SWPOUT to non-block devices. Fixes: 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS swap-space") Reported-by: Christoph Hellwig Closes: https://lore.kernel.org/linux-mm/20240614100329.1203579-1-hch@lst.de/ Cc: NeilBrown Cc: Anna Schumaker Cc: Steve French Cc: Trond Myklebust Cc: Chuanhua Han Cc: Ryan Roberts Cc: Chris Li Cc: "Huang, Ying" Cc: Jeff Layton Cc: Paulo Alcantara Cc: Ronnie Sahlberg Cc: Shyam Prasad N Cc: Tom Talpey Cc: Bharath SM Cc: Signed-off-by: Barry Song Signed-off-by: Steve French --- fs/smb/client/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 9d5c2440abfc8..1e269e0bc75b3 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -3200,8 +3200,6 @@ static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { ssize_t ret; - WARN_ON_ONCE(iov_iter_count(iter) != PAGE_SIZE); - if (iov_iter_rw(iter) == READ) ret = netfs_unbuffered_read_iter_locked(iocb, iter); else From 739c9765793e5794578a64aab293c58607f1826a Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 18 Jun 2024 15:01:52 +0100 Subject: [PATCH 170/272] x86/resctrl: Don't try to free nonexistent RMIDs Commit 6791e0ea3071 ("x86/resctrl: Access per-rmid structures by index") adds logic to map individual monitoring groups into a global index space used for tracking allocated RMIDs. Attempts to free the default RMID are ignored in free_rmid(), and this works fine on x86. With arm64 MPAM, there is a latent bug here however: on platforms with no monitors exposed through resctrl, each control group still gets a different monitoring group ID as seen by the hardware, since the CLOSID always forms part of the monitoring group ID. 
This means that when removing a control group, the code may try to free this group's default monitoring group RMID for real. If there are no monitors however, the RMID tracking table rmid_ptrs[] would be a waste of memory and is never allocated, leading to a splat when free_rmid() tries to dereference the table. One option would be to treat RMID 0 as special for every CLOSID, but this would be ugly since bookkeeping still needs to be done for these monitoring group IDs when there are monitors present in the hardware. Instead, add a gating check of resctrl_arch_mon_capable() in free_rmid(), and just do nothing if the hardware doesn't have monitors. This fix mirrors the gating checks already present in mkdir_rdt_prepare_rmid_alloc() and elsewhere. No functional change on x86. [ bp: Massage commit message. ] Fixes: 6791e0ea3071 ("x86/resctrl: Access per-rmid structures by index") Signed-off-by: Dave Martin Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Reinette Chatre Link: https://lore.kernel.org/r/20240618140152.83154-1-Dave.Martin@arm.com --- arch/x86/kernel/cpu/resctrl/monitor.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 2345e6836593f..366f496ca3ce2 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -519,7 +519,8 @@ void free_rmid(u32 closid, u32 rmid) * allows architectures that ignore the closid parameter to avoid an * unnecessary check. 
*/ - if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + if (!resctrl_arch_mon_capable() || + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID)) return; From 7be4cb7189f747b4e5b6977d0e4387bde3204e62 Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Mon, 17 Jun 2024 12:28:21 +0200 Subject: [PATCH 171/272] net: usb: ax88179_178a: improve reset check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After ecf848eb934b ("net: usb: ax88179_178a: fix link status when link is set to down/up") to not reset from usbnet_open after the reset from usbnet_probe at initialization stage to speed up this, some issues have been reported. It seems to happen that if the initialization is slower, and some time passes between the probe operation and the open operation, the second reset from open is necessary too to have the device working. The reason is that if there is no activity with the phy, this is "disconnected". In order to improve this, the solution is to detect when the phy is "disconnected", and we can use the phy status register for this. So we will only reset the device from reset operation in this situation, that is, only if necessary. The same behavior is happening when the device is stopped (link set to down) and later is restarted (link set to up), so if the phy keeps working we only need to enable the mac again, but if enough time passes between the device stop and restart, reset is necessary, and we can detect the situation checking the phy status register too. cc: stable@vger.kernel.org # 6.6+ Fixes: ecf848eb934b ("net: usb: ax88179_178a: fix link status when link is set to down/up") Reported-by: Yongqin Liu Reported-by: Antje Miederhöfer Reported-by: Arne Fitzenreiter Tested-by: Yongqin Liu Tested-by: Antje Miederhöfer Signed-off-by: Jose Ignacio Tornos Martinez Signed-off-by: David S. 
Miller --- drivers/net/usb/ax88179_178a.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 51c295e1e823a..c2fb736f78b26 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -174,7 +174,6 @@ struct ax88179_data { u32 wol_supported; u32 wolopts; u8 disconnecting; - u8 initialized; }; struct ax88179_int_data { @@ -1678,12 +1677,21 @@ static int ax88179_reset(struct usbnet *dev) static int ax88179_net_reset(struct usbnet *dev) { - struct ax88179_data *ax179_data = dev->driver_priv; + u16 tmp16; - if (ax179_data->initialized) + ax88179_read_cmd(dev, AX_ACCESS_PHY, AX88179_PHY_ID, GMII_PHY_PHYSR, + 2, &tmp16); + if (tmp16) { + ax88179_read_cmd(dev, AX_ACCESS_MAC, AX_MEDIUM_STATUS_MODE, + 2, 2, &tmp16); + if (!(tmp16 & AX_MEDIUM_RECEIVE_EN)) { + tmp16 |= AX_MEDIUM_RECEIVE_EN; + ax88179_write_cmd(dev, AX_ACCESS_MAC, AX_MEDIUM_STATUS_MODE, + 2, 2, &tmp16); + } + } else { ax88179_reset(dev); - else - ax179_data->initialized = 1; + } return 0; } From 604141c036e1b636e2a71cf6e1aa09d1e45f40c2 Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Mon, 17 Jun 2024 21:15:23 +0800 Subject: [PATCH 172/272] virtio_net: checksum offloading handling fix In virtio spec 0.95, VIRTIO_NET_F_GUEST_CSUM was designed to handle partially checksummed packets, and the validation of fully checksummed packets by the device is independent of VIRTIO_NET_F_GUEST_CSUM negotiation. However, the specification erroneously stated: "If VIRTIO_NET_F_GUEST_CSUM is not negotiated, the device MUST set flags to zero and SHOULD supply a fully checksummed packet to the driver." This statement is inaccurate because even without VIRTIO_NET_F_GUEST_CSUM negotiation, the device can still set the VIRTIO_NET_HDR_F_DATA_VALID flag. 
Essentially, the device can facilitate the validation of these packets' checksums - a process known as RX checksum offloading - removing the need for the driver to do so. This scenario is currently not implemented in the driver and requires correction. The necessary specification correction[1] has been made and approved in the virtio TC vote. [1] https://lists.oasis-open.org/archives/virtio-comment/202401/msg00011.html Fixes: 4f49129be6fa ("virtio-net: Set RXCSUM feature if GUEST_CSUM is available") Signed-off-by: Heng Qi Reviewed-by: Jiri Pirko Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 61a57d134544f..aa70a7ed8072c 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -5666,8 +5666,16 @@ static int virtnet_probe(struct virtio_device *vdev) dev->features |= dev->hw_features & NETIF_F_ALL_TSO; /* (!csum && gso) case will be fixed by register_netdev() */ } - if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM)) - dev->features |= NETIF_F_RXCSUM; + + /* 1. With VIRTIO_NET_F_GUEST_CSUM negotiation, the driver doesn't + * need to calculate checksums for partially checksummed packets, + * as they're considered valid by the upper layer. + * 2. Without VIRTIO_NET_F_GUEST_CSUM negotiation, the driver only + * receives fully checksummed packets. The device may assist in + * validating these packets' checksums, so the driver won't have to. 
+ */ + dev->features |= NETIF_F_RXCSUM; + if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) dev->features |= NETIF_F_GRO_HW; From 703eec1b242276f2d97d98f04790ddad319ddde4 Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Mon, 17 Jun 2024 21:15:24 +0800 Subject: [PATCH 173/272] virtio_net: fixing XDP for fully checksummed packets handling The XDP program can't correctly handle partially checksummed packets, but works fine with fully checksummed packets. If the device has already validated fully checksummed packets, then the driver doesn't need to re-validate them, saving CPU resources. Additionally, the driver does not drop all partially checksummed packets when VIRTIO_NET_F_GUEST_CSUM is not negotiated. This is not a bug, as the driver has always done this. Fixes: 436c9453a1ac ("virtio-net: keep vnet header zeroed after processing XDP") Signed-off-by: Heng Qi Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index aa70a7ed8072c..ea10db9a09fa2 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1360,6 +1360,10 @@ static struct sk_buff *receive_small_xdp(struct net_device *dev, if (unlikely(hdr->hdr.gso_type)) goto err_xdp; + /* Partially checksummed packets must be dropped. */ + if (unlikely(hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) + goto err_xdp; + buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); @@ -1677,6 +1681,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, if (unlikely(hdr->hdr.gso_type)) return NULL; + /* Partially checksummed packets must be dropped. 
*/ + if (unlikely(hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) + return NULL; + /* Now XDP core assumes frag size is PAGE_SIZE, but buffers * with headroom may add hole in truesize, which * make their length exceed PAGE_SIZE. So we disabled the @@ -1943,6 +1951,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, struct net_device *dev = vi->dev; struct sk_buff *skb; struct virtio_net_common_hdr *hdr; + u8 flags; if (unlikely(len < vi->hdr_len + ETH_HLEN)) { pr_debug("%s: short packet %i\n", dev->name, len); @@ -1951,6 +1960,15 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, return; } + /* 1. Save the flags early, as the XDP program might overwrite them. + * These flags ensure packets marked as VIRTIO_NET_HDR_F_DATA_VALID + * stay valid after XDP processing. + * 2. XDP doesn't work with partially checksummed packets (refer to + * virtnet_xdp_set()), so packets marked as + * VIRTIO_NET_HDR_F_NEEDS_CSUM get dropped during XDP processing. + */ + flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; + if (vi->mergeable_rx_bufs) skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); @@ -1966,7 +1984,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report) virtio_skb_set_hash(&hdr->hash_v1_hdr, skb); - if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) + if (flags & VIRTIO_NET_HDR_F_DATA_VALID) skb->ip_summed = CHECKSUM_UNNECESSARY; if (virtio_net_hdr_to_skb(skb, &hdr->hdr, From b95a4afe2defd6f46891985f9436a568cd35a31c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 17 Jun 2024 17:50:26 +0100 Subject: [PATCH 174/272] octeontx2-pf: Add error handling to VLAN unoffload handling otx2_sq_append_skb makes use of __vlan_hwaccel_push_inside() to unoffload VLANs - push them from skb meta data into skb data. However, it omits a check for __vlan_hwaccel_push_inside() returning NULL.
Found by inspection based on [1] and [2]. Compile tested only. [1] Re: [PATCH net-next v1] net: stmmac: Enable TSO on VLANs https://lore.kernel.org/all/ZmrN2W8Fye450TKs@shell.armlinux.org.uk/ [2] Re: [PATCH net-next v2] net: stmmac: Enable TSO on VLANs https://lore.kernel.org/all/CANn89i+11L5=tKsa7V7Aeyxaj6nYGRwy35PAbCRYJ73G+b25sg@mail.gmail.com/ Fixes: fd9d7859db6c ("octeontx2-pf: Implement ingress/egress VLAN offload") Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index a16e9f244117b..929b4eac25d97 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -1174,8 +1174,11 @@ bool otx2_sq_append_skb(struct net_device *netdev, struct otx2_snd_queue *sq, if (skb_shinfo(skb)->gso_size && !is_hw_tso_supported(pfvf, skb)) { /* Insert vlan tag before giving pkt to tso */ - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { skb = __vlan_hwaccel_push_inside(skb); + if (!skb) + return true; + } otx2_sq_append_tso(pfvf, sq, skb, qidx); return true; } From fa997b0576c9df635ee363406f5e014dba0f9264 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 18 Jun 2024 17:28:29 +0200 Subject: [PATCH 175/272] ata: ahci: Do not enable LPM if no LPM states are supported by the HBA LPM consists of HIPM (host initiated power management) and DIPM (device initiated power management). ata_eh_set_lpm() will only enable HIPM if both the HBA and the device supports it. However, DIPM will be enabled as long as the device supports it. The HBA will later reject the device's request to enter a power state that it does not support (Slumber/Partial/DevSleep) (DevSleep is never initiated by the device). 
For a HBA that doesn't support any LPM states, simply don't set a LPM policy such that all the HIPM/DIPM probing/enabling will be skipped. Not enabling HIPM or DIPM in the first place is safer than relying on the device following the AHCI specification and respecting the NAK. (There are comments in the code that some devices misbehave when receiving a NAK.) Performing this check in ahci_update_initial_lpm_policy() also has the advantage that a HBA that doesn't support any LPM states will take the exact same code paths as a port that is external/hot plug capable. Side note: the port in ata_port_dbg() has not been given a unique id yet, but this is not overly important as the debug print is disabled unless explicitly enabled using dynamic debug. A follow-up series will make sure that the unique id assignment will be done earlier. For now, the important thing is that the function returns before setting the LPM policy. Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type") Cc: stable@vger.kernel.org Reviewed-by: Mario Limonciello Reviewed-by: Mika Westerberg Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240618152828.2686771-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/ahci.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 07d66d2c5f0dd..5eb38fbbbecdb 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1735,6 +1735,14 @@ static void ahci_update_initial_lpm_policy(struct ata_port *ap) if (ap->pflags & ATA_PFLAG_EXTERNAL) return; + /* If no LPM states are supported by the HBA, do not bother with LPM */ + if ((ap->host->flags & ATA_HOST_NO_PART) && + (ap->host->flags & ATA_HOST_NO_SSC) && + (ap->host->flags & ATA_HOST_NO_DEVSLP)) { + ata_port_dbg(ap, "no LPM states supported, not enabling LPM\n"); + return; + } + /* user modified policy via module param */ if (mobile_lpm_policy != -1) { policy = mobile_lpm_policy; From 1062d03827b78614259b3b4b992deb27ee6aa84d 
Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Tue, 18 Jun 2024 11:41:22 +0530 Subject: [PATCH 176/272] octeontx2-pf: Fix linking objects into multiple modules This patch fixes the below build warning messages that are caused due to linking same files to multiple modules by exporting the required symbols. "scripts/Makefile.build:244: drivers/net/ethernet/marvell/octeontx2/nic/Makefile: otx2_devlink.o is added to multiple modules: rvu_nicpf rvu_nicvf scripts/Makefile.build:244: drivers/net/ethernet/marvell/octeontx2/nic/Makefile: otx2_dcbnl.o is added to multiple modules: rvu_nicpf rvu_nicvf" Fixes: 8e67558177f8 ("octeontx2-pf: PFC config support with DCBx"). Signed-off-by: Geetha sowjanya Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/Makefile | 3 +-- drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c | 7 +++++++ drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c | 2 ++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile index 5664f768cb0cd..64a97a0a10ed6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile @@ -9,10 +9,9 @@ obj-$(CONFIG_OCTEONTX2_VF) += rvu_nicvf.o otx2_ptp.o rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \ otx2_flows.o otx2_tc.o cn10k.o otx2_dmac_flt.o \ otx2_devlink.o qos_sq.o qos.o -rvu_nicvf-y := otx2_vf.o otx2_devlink.o +rvu_nicvf-y := otx2_vf.o rvu_nicpf-$(CONFIG_DCB) += otx2_dcbnl.o -rvu_nicvf-$(CONFIG_DCB) += otx2_dcbnl.o rvu_nicpf-$(CONFIG_MACSEC) += cn10k_macsec.o ccflags-y += -I$(srctree)/drivers/net/ethernet/marvell/octeontx2/af diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c index 28fb643d2917f..aa01110f04a33 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c +++ 
b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c @@ -54,6 +54,7 @@ int otx2_pfc_txschq_config(struct otx2_nic *pfvf) return 0; } +EXPORT_SYMBOL(otx2_pfc_txschq_config); static int otx2_pfc_txschq_alloc_one(struct otx2_nic *pfvf, u8 prio) { @@ -122,6 +123,7 @@ int otx2_pfc_txschq_alloc(struct otx2_nic *pfvf) return 0; } +EXPORT_SYMBOL(otx2_pfc_txschq_alloc); static int otx2_pfc_txschq_stop_one(struct otx2_nic *pfvf, u8 prio) { @@ -260,6 +262,7 @@ int otx2_pfc_txschq_update(struct otx2_nic *pfvf) return 0; } +EXPORT_SYMBOL(otx2_pfc_txschq_update); int otx2_pfc_txschq_stop(struct otx2_nic *pfvf) { @@ -282,6 +285,7 @@ int otx2_pfc_txschq_stop(struct otx2_nic *pfvf) return 0; } +EXPORT_SYMBOL(otx2_pfc_txschq_stop); int otx2_config_priority_flow_ctrl(struct otx2_nic *pfvf) { @@ -321,6 +325,7 @@ int otx2_config_priority_flow_ctrl(struct otx2_nic *pfvf) mutex_unlock(&pfvf->mbox.lock); return err; } +EXPORT_SYMBOL(otx2_config_priority_flow_ctrl); void otx2_update_bpid_in_rqctx(struct otx2_nic *pfvf, int vlan_prio, int qidx, bool pfc_enable) @@ -385,6 +390,7 @@ void otx2_update_bpid_in_rqctx(struct otx2_nic *pfvf, int vlan_prio, int qidx, "Updating BPIDs in CQ and Aura contexts of RQ%d failed with err %d\n", qidx, err); } +EXPORT_SYMBOL(otx2_update_bpid_in_rqctx); static int otx2_dcbnl_ieee_getpfc(struct net_device *dev, struct ieee_pfc *pfc) { @@ -472,3 +478,4 @@ int otx2_dcbnl_set_ops(struct net_device *dev) return 0; } +EXPORT_SYMBOL(otx2_dcbnl_set_ops); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c index 99ddf31269d96..458d34a62e189 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c @@ -113,6 +113,7 @@ int otx2_register_dl(struct otx2_nic *pfvf) devlink_free(dl); return err; } +EXPORT_SYMBOL(otx2_register_dl); void otx2_unregister_dl(struct otx2_nic *pfvf) { @@ -124,3 +125,4 @@ void 
otx2_unregister_dl(struct otx2_nic *pfvf) ARRAY_SIZE(otx2_dl_params)); devlink_free(dl); } +EXPORT_SYMBOL(otx2_unregister_dl); From a8763466669d21b570b26160d0a5e0a2ee529d22 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 18 Jun 2024 09:29:21 +0200 Subject: [PATCH 177/272] selftests: openvswitch: Set value to nla flags. Netlink flags, although they don't have payload at the netlink level, are represented as having "True" as value in pyroute2. Without it, trying to add a flow with a flag-type action (e.g: pop_vlan) fails with the following traceback: Traceback (most recent call last): File "[...]/ovs-dpctl.py", line 2498, in sys.exit(main(sys.argv)) ^^^^^^^^^^^^^^ File "[...]/ovs-dpctl.py", line 2487, in main ovsflow.add_flow(rep["dpifindex"], flow) File "[...]/ovs-dpctl.py", line 2136, in add_flow reply = self.nlm_request( ^^^^^^^^^^^^^^^^^ File "[...]/pyroute2/netlink/nlsocket.py", line 822, in nlm_request return tuple(self._genlm_request(*argv, **kwarg)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "[...]/pyroute2/netlink/generic/__init__.py", line 126, in nlm_request return tuple(super().nlm_request(*argv, **kwarg)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "[...]/pyroute2/netlink/nlsocket.py", line 1124, in nlm_request self.put(msg, msg_type, msg_flags, msg_seq=msg_seq) File "[...]/pyroute2/netlink/nlsocket.py", line 389, in put self.sendto_gate(msg, addr) File "[...]/pyroute2/netlink/nlsocket.py", line 1056, in sendto_gate msg.encode() File "[...]/pyroute2/netlink/__init__.py", line 1245, in encode offset = self.encode_nlas(offset) ^^^^^^^^^^^^^^^^^^^^^^^^ File "[...]/pyroute2/netlink/__init__.py", line 1560, in encode_nlas nla_instance.setvalue(cell[1]) File "[...]/pyroute2/netlink/__init__.py", line 1265, in setvalue nlv.setvalue(nla_tuple[1]) ~~~~~~~~~^^^ IndexError: list index out of range Signed-off-by: Adrian Moreno Acked-by: Aaron Conole Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/openvswitch/ovs-dpctl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index 1dd057afd3fbe..9f8dec2f6539c 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -531,7 +531,7 @@ def parse(self, actstr): for flat_act in parse_flat_map: if parse_starts_block(actstr, flat_act[0], False): actstr = actstr[len(flat_act[0]):] - self["attrs"].append([flat_act[1]]) + self["attrs"].append([flat_act[1], True]) actstr = actstr[strspn(actstr, ", ") :] parsed = True From df75470b317b46affbe1f5f8f006b34175be9789 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 18 Jun 2024 19:34:18 +0200 Subject: [PATCH 178/272] spi: spi-imx: imx51: revert burst length calculation back to bits_per_word The patch 15a6af94a277 ("spi: Increase imx51 ecspi burst length based on transfer length") increased the burst length calculation in mx51_ecspi_prepare_transfer() to be based on the transfer length. This breaks HW CS + SPI_CS_WORD support which was added in 6e95b23a5b2d ("spi: imx: Implement support for CS_WORD") and transfers with bits-per-word != 8, 16, 32. SPI_CS_WORD means the CS should be toggled after each word. The implementation in the imx-spi driver relies on the fact that the HW CS is toggled automatically by the controller after each burst length number of bits. Setting the burst length to the number of bits of the _whole_ message breaks this use case. Further the patch 15a6af94a277 ("spi: Increase imx51 ecspi burst length based on transfer length") claims to optimize the transfers. But even without this patch, on modern spi-imx controllers with "dynamic_burst = true" (imx51, imx6 and newer), the transfers are already optimized, i.e. the burst length is dynamically adjusted in spi_imx_push() to avoid the pause between the SPI bursts. 
This has been confirmed by a scope measurement on an imx6d. Subsequent Patches tried to fix these and other problems: - 5f66db08cbd3 ("spi: imx: Take in account bits per word instead of assuming 8-bits") - e9b220aeacf1 ("spi: spi-imx: correctly configure burst length when using dma") - c712c05e46c8 ("spi: imx: fix the burst length at DMA mode and CPU mode") - cf6d79a0f576 ("spi: spi-imx: fix off-by-one in mx51 CPU mode burst length") but the HW CS + SPI_CS_WORD use case is still broken. To fix the problems revert the burst size calculation in mx51_ecspi_prepare_transfer() back to the original form, before 15a6af94a277 ("spi: Increase imx51 ecspi burst length based on transfer length") was applied. Cc: Stefan Moring Cc: Stefan Bigler Cc: Clark Wang Cc: Carlos Song Cc: Sebastian Reichel Cc: Thorsten Scherer Fixes: 15a6af94a277 ("spi: Increase imx51 ecspi burst length based on transfer length") Fixes: 5f66db08cbd3 ("spi: imx: Take in account bits per word instead of assuming 8-bits") Fixes: e9b220aeacf1 ("spi: spi-imx: correctly configure burst length when using dma") Fixes: c712c05e46c8 ("spi: imx: fix the burst length at DMA mode and CPU mode") Fixes: cf6d79a0f576 ("spi: spi-imx: fix off-by-one in mx51 CPU mode burst length") Link: https://lore.kernel.org/all/20240618-oxpecker-of-ideal-mastery-db59f8-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde Tested-by: Thorsten Scherer Link: https://msgid.link/r/20240618-spi-imx-fix-bustlength-v1-1-2053dd5fdf87@pengutronix.de Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index f4006c82f867a..33164ebdb5831 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -660,18 +660,8 @@ static int mx51_ecspi_prepare_transfer(struct spi_imx_data *spi_imx, ctrl |= (spi_imx->target_burst * 8 - 1) << MX51_ECSPI_CTRL_BL_OFFSET; else { - if (spi_imx->usedma) { - ctrl |= (spi_imx->bits_per_word - 
1) - << MX51_ECSPI_CTRL_BL_OFFSET; - } else { - if (spi_imx->count >= MX51_ECSPI_CTRL_MAX_BURST) - ctrl |= (MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1) - << MX51_ECSPI_CTRL_BL_OFFSET; - else - ctrl |= (spi_imx->count / DIV_ROUND_UP(spi_imx->bits_per_word, - BITS_PER_BYTE) * spi_imx->bits_per_word - 1) - << MX51_ECSPI_CTRL_BL_OFFSET; - } + ctrl |= (spi_imx->bits_per_word - 1) + << MX51_ECSPI_CTRL_BL_OFFSET; } /* set clock speed */ From 8ecd06277a7664f4ef018abae3abd3451d64e7a6 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 17 Jun 2024 11:18:15 +0200 Subject: [PATCH 179/272] netfilter: ipset: Fix suspicious rcu_dereference_protected() When destroying all sets, we are either in pernet exit phase or are executing a "destroy all sets command" from userspace. The latter was taken into account in ip_set_dereference() (nfnetlink mutex is held), but the former was not. The patch adds the required check to rcu_dereference_protected() in ip_set_dereference(). Fixes: 4e7aaa6b82d6 ("netfilter: ipset: Fix race between namespace cleanup and gc in the list:set type") Reported-by: syzbot+b62c37cdd58103293a5a@syzkaller.appspotmail.com Reported-by: syzbot+cfbe1da5fdfc39efc293@syzkaller.appspotmail.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202406141556.e0b6f17e-lkp@intel.com Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index c7ae4d9bf3d24..61431690cbd5f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -53,12 +53,13 @@ MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex or ip_set_ref_lock is held: */ -#define ip_set_dereference(p) \ - rcu_dereference_protected(p, \ +#define ip_set_dereference(inst) \ + 
rcu_dereference_protected((inst)->ip_set_list, \ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ - lockdep_is_held(&ip_set_ref_lock)) + lockdep_is_held(&ip_set_ref_lock) || \ + (inst)->is_deleted) #define ip_set(inst, id) \ - ip_set_dereference((inst)->ip_set_list)[id] + ip_set_dereference(inst)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] #define ip_set_dereference_nfnl(p) \ @@ -1133,7 +1134,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ - tmp = ip_set_dereference(inst->ip_set_list); + tmp = ip_set_dereference(inst); memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); rcu_assign_pointer(inst->ip_set_list, list); /* Make sure all current packets have passed through */ From 9a3bc8d16e0aacd65c31aaf23a2bced3288a7779 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 13 Jun 2024 17:42:46 +0800 Subject: [PATCH 180/272] seg6: fix parameter passing when calling NF_HOOK() in End.DX4 and End.DX6 behaviors input_action_end_dx4() and input_action_end_dx6() call NF_HOOK() for the PREROUTING hook. For the PREROUTING hook, we should pass a valid indev and a NULL outdev to NF_HOOK(); otherwise this may trigger a NULL pointer dereference, as below: [74830.647293] BUG: kernel NULL pointer dereference, address: 0000000000000090 [74830.655633] #PF: supervisor read access in kernel mode [74830.657888] #PF: error_code(0x0000) - not-present page [74830.659500] PGD 0 P4D 0 [74830.660450] Oops: 0000 [#1] PREEMPT SMP PTI ... [74830.664953] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [74830.666569] RIP: 0010:rpfilter_mt+0x44/0x15e [ipt_rpfilter] ... [74830.689725] Call Trace: [74830.690402] [74830.690953] ? show_trace_log_lvl+0x1c4/0x2df [74830.692020] ? show_trace_log_lvl+0x1c4/0x2df [74830.693095] ? ipt_do_table+0x286/0x710 [ip_tables] [74830.694275] ? __die_body.cold+0x8/0xd [74830.695205] ?
page_fault_oops+0xac/0x140 [74830.696244] ? exc_page_fault+0x62/0x150 [74830.697225] ? asm_exc_page_fault+0x22/0x30 [74830.698344] ? rpfilter_mt+0x44/0x15e [ipt_rpfilter] [74830.699540] ipt_do_table+0x286/0x710 [ip_tables] [74830.700758] ? ip6_route_input+0x19d/0x240 [74830.701752] nf_hook_slow+0x3f/0xb0 [74830.702678] input_action_end_dx4+0x19b/0x1e0 [74830.703735] ? input_action_end_t+0xe0/0xe0 [74830.704734] seg6_local_input_core+0x2d/0x60 [74830.705782] lwtunnel_input+0x5b/0xb0 [74830.706690] __netif_receive_skb_one_core+0x63/0xa0 [74830.707825] process_backlog+0x99/0x140 [74830.709538] __napi_poll+0x2c/0x160 [74830.710673] net_rx_action+0x296/0x350 [74830.711860] __do_softirq+0xcb/0x2ac [74830.713049] do_softirq+0x63/0x90 input_action_end_dx4() passes a NULL indev to NF_HOOK(), which finally triggers a NULL dereference in rpfilter_mt()->rpfilter_is_loopback(): static bool rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in) { // in is NULL return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } Fixes: 7a3f5b0de364 ("netfilter: add netfilter hooks to SRv6 data plane") Signed-off-by: Jianguo Wu Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/ipv6/seg6_local.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 24e2b4b494cb0..c434940131b1d 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -941,8 +941,8 @@ static int input_action_end_dx6(struct sk_buff *skb, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, input_action_end_dx6_finish); + dev_net(skb->dev), NULL, skb, skb->dev, + NULL, input_action_end_dx6_finish); return input_action_end_dx6_finish(dev_net(skb->dev), NULL, skb); drop: @@ -991,8 +991,8 @@ static int input_action_end_dx4(struct sk_buff *skb, if
(static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, input_action_end_dx4_finish); + dev_net(skb->dev), NULL, skb, skb->dev, + NULL, input_action_end_dx4_finish); return input_action_end_dx4_finish(dev_net(skb->dev), NULL, skb); drop: From 096597cfe4ea08b1830e775436d76d7c9d6d3037 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 18 Jun 2024 21:44:24 -0700 Subject: [PATCH 181/272] thermal: int340x: processor_thermal: Support shared interrupts On some systems the processor thermal device interrupt is shared with other PCI devices. In this case return IRQ_NONE from the interrupt handler when the interrupt is not for the processor thermal device. Signed-off-by: Srinivas Pandruvada Fixes: f0658708e863 ("thermal: int340x: processor_thermal: Use non MSI interrupts by default") Cc: 6.7+ # 6.7+ Signed-off-by: Rafael J. Wysocki --- .../intel/int340x_thermal/processor_thermal_device_pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index 14e34eabc4191..4a1bfebb1b8e5 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -150,7 +150,7 @@ static irqreturn_t proc_thermal_irq_handler(int irq, void *devid) { struct proc_thermal_pci *pci_info = devid; struct proc_thermal_device *proc_priv; - int ret = IRQ_HANDLED; + int ret = IRQ_NONE; u32 status; proc_priv = pci_info->proc_priv; @@ -175,6 +175,7 @@ static irqreturn_t proc_thermal_irq_handler(int irq, void *devid) /* Disable enable interrupt flag */ proc_thermal_mmio_write(pci_info, PROC_THERMAL_MMIO_INT_ENABLE_0, 0); pkg_thermal_schedule_work(&pci_info->work); + ret = IRQ_HANDLED; } pci_write_config_byte(pci_info->pdev, 0xdc, 0x01); From 
a2225e0250c5fa397dcebf6ce65a9f05a114e0cf Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 13 Jun 2024 17:42:47 +0800 Subject: [PATCH 182/272] netfilter: move the sysctl nf_hooks_lwtunnel into the netfilter core Currently, the sysctl net.netfilter.nf_hooks_lwtunnel depends on the nf_conntrack module, but the nf_conntrack module is not always loaded. Therefore, accessing net.netfilter.nf_hooks_lwtunnel may have an error. Move sysctl nf_hooks_lwtunnel into the netfilter core. Fixes: 7a3f5b0de364 ("netfilter: add netfilter hooks to SRv6 data plane") Suggested-by: Pablo Neira Ayuso Signed-off-by: Jianguo Wu Signed-off-by: Pablo Neira Ayuso --- include/net/netns/netfilter.h | 3 ++ net/netfilter/core.c | 13 ++++- net/netfilter/nf_conntrack_standalone.c | 15 ------ net/netfilter/nf_hooks_lwtunnel.c | 67 +++++++++++++++++++++++++ net/netfilter/nf_internals.h | 6 +++ 5 files changed, 87 insertions(+), 17 deletions(-) diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 02bbdc577f8e2..a6a0bf4a247e5 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -15,6 +15,9 @@ struct netns_nf { const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO]; #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; +#ifdef CONFIG_LWTUNNEL + struct ctl_table_header *nf_lwtnl_dir_header; +#endif #endif struct nf_hook_entries __rcu *hooks_ipv4[NF_INET_NUMHOOKS]; struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS]; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 3126911f50425..b00fc285b3349 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -815,12 +815,21 @@ int __init netfilter_init(void) if (ret < 0) goto err; +#ifdef CONFIG_LWTUNNEL + ret = netfilter_lwtunnel_init(); + if (ret < 0) + goto err_lwtunnel_pernet; +#endif ret = netfilter_log_init(); if (ret < 0) - goto err_pernet; + goto err_log_pernet; return 0; -err_pernet: +err_log_pernet: +#ifdef CONFIG_LWTUNNEL + netfilter_lwtunnel_fini(); 
+err_lwtunnel_pernet: +#endif unregister_pernet_subsys(&netfilter_net_ops); err: return ret; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 74112e9c5dabc..6c40bdf8b05ab 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -22,9 +22,6 @@ #include #include #include -#ifdef CONFIG_LWTUNNEL -#include -#endif #include static bool enable_hooks __read_mostly; @@ -612,9 +609,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif -#ifdef CONFIG_LWTUNNEL - NF_SYSCTL_CT_LWTUNNEL, -#endif NF_SYSCTL_CT_LAST_SYSCTL, }; @@ -946,15 +940,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif -#ifdef CONFIG_LWTUNNEL - [NF_SYSCTL_CT_LWTUNNEL] = { - .procname = "nf_hooks_lwtunnel", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = nf_hooks_lwtunnel_sysctl_handler, - }, -#endif }; static struct ctl_table nf_ct_netfilter_table[] = { diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c index 00e89ffd78f69..7cdb59bb4459f 100644 --- a/net/netfilter/nf_hooks_lwtunnel.c +++ b/net/netfilter/nf_hooks_lwtunnel.c @@ -3,6 +3,9 @@ #include #include #include +#include + +#include "nf_internals.h" static inline int nf_hooks_lwtunnel_get(void) { @@ -50,4 +53,68 @@ int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write, return ret; } EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler); + +static struct ctl_table nf_lwtunnel_sysctl_table[] = { + { + .procname = "nf_hooks_lwtunnel", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = nf_hooks_lwtunnel_sysctl_handler, + }, +}; + +static int __net_init nf_lwtunnel_net_init(struct net *net) +{ + struct ctl_table_header *hdr; + struct ctl_table *table; + + table = nf_lwtunnel_sysctl_table; + if (!net_eq(net, &init_net)) { + table = 
kmemdup(nf_lwtunnel_sysctl_table, + sizeof(nf_lwtunnel_sysctl_table), + GFP_KERNEL); + if (!table) + goto err_alloc; + } + + hdr = register_net_sysctl_sz(net, "net/netfilter", table, + ARRAY_SIZE(nf_lwtunnel_sysctl_table)); + if (!hdr) + goto err_reg; + + net->nf.nf_lwtnl_dir_header = hdr; + + return 0; +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit nf_lwtunnel_net_exit(struct net *net) +{ + const struct ctl_table *table; + + table = net->nf.nf_lwtnl_dir_header->ctl_table_arg; + unregister_net_sysctl_table(net->nf.nf_lwtnl_dir_header); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct pernet_operations nf_lwtunnel_net_ops = { + .init = nf_lwtunnel_net_init, + .exit = nf_lwtunnel_net_exit, +}; + +int __init netfilter_lwtunnel_init(void) +{ + return register_pernet_subsys(&nf_lwtunnel_net_ops); +} + +void netfilter_lwtunnel_fini(void) +{ + unregister_pernet_subsys(&nf_lwtunnel_net_ops); +} #endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 832ae64179f0f..25403023060b6 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -29,6 +29,12 @@ void nf_queue_nf_hook_drop(struct net *net); /* nf_log.c */ int __init netfilter_log_init(void); +#ifdef CONFIG_LWTUNNEL +/* nf_hooks_lwtunnel.c */ +int __init netfilter_lwtunnel_init(void); +void netfilter_lwtunnel_fini(void); +#endif + /* core.c */ void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg); From 72e50ef99431163203657128050d0697caf3321d Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 13 Jun 2024 17:42:48 +0800 Subject: [PATCH 183/272] selftests: add selftest for the SRv6 End.DX4 behavior with netfilter this selftest is designed for evaluating the SRv6 End.DX4 behavior used with netfilter(rpfilter), in this example, for implementing IPv4 L3 VPN use cases. 
Signed-off-by: Jianguo Wu Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config | 1 + .../net/srv6_end_dx4_netfilter_test.sh | 335 ++++++++++++++++++ 3 files changed, 337 insertions(+) create mode 100755 tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index bd01e4a0be2c2..7a5f7dd320deb 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -43,6 +43,7 @@ TEST_PROGS += srv6_hl2encap_red_l2vpn_test.sh TEST_PROGS += srv6_end_next_csid_l3vpn_test.sh TEST_PROGS += srv6_end_x_next_csid_l3vpn_test.sh TEST_PROGS += srv6_end_flavors_test.sh +TEST_PROGS += srv6_end_dx4_netfilter_test.sh TEST_PROGS += vrf_strict_mode_test.sh TEST_PROGS += arp_ndisc_evict_nocarrier.sh TEST_PROGS += ndisc_unsolicited_na_test.sh diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 04de7a6ba6f31..c2766e558f925 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -101,3 +101,4 @@ CONFIG_NETFILTER_XT_MATCH_POLICY=m CONFIG_CRYPTO_ARIA=y CONFIG_XFRM_INTERFACE=m CONFIG_XFRM_USER=m +CONFIG_IP_NF_MATCH_RPFILTER=m diff --git a/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh b/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh new file mode 100755 index 0000000000000..e23210aa547fd --- /dev/null +++ b/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh @@ -0,0 +1,335 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# author: Jianguo Wu +# +# Mostly copied from tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh. +# +# This script is designed for testing the support of netfilter hooks for +# SRv6 End.DX4 behavior. +# +# Hereafter a network diagram is shown, where one tenant (named 100) offers +# IPv4 L3 VPN services allowing hosts to communicate with each other across +# an IPv6 network.
+# +# Routers rt-1 and rt-2 implement IPv4 L3 VPN services leveraging the SRv6 +# architecture. The key components for such VPNs are: a) SRv6 Encap behavior, +# b) SRv6 End.DX4 behavior. +# +# To explain how an IPv4 L3 VPN based on SRv6 works, let us briefly consider an +# example where, within the same domain of tenant 100, the host hs-1 pings +# the host hs-2. +# +# First of all, L2 reachability of the host hs-2 is taken into account by +# the router rt-1 which acts as an arp proxy. +# +# When the host hs-1 sends an IPv4 packet destined to hs-2, the router rt-1 +# receives the packet on the internal veth-t100 interface, rt-1 contains the +# SRv6 Encap route for encapsulating the IPv4 packet in an IPv6 plus the Segment +# Routing Header (SRH) packet. This packet is sent through the (IPv6) core +# network up to the router rt-2 that receives it on veth0 interface. +# +# The rt-2 router uses the 'localsid' routing table to process incoming +# IPv6+SRH packets which belong to the VPN of the tenant 100. For each of these +# packets, the SRv6 End.DX4 behavior removes the outer IPv6+SRH headers and +# routes the packet to the specified nexthop. Afterwards, the packet is sent to +# the host hs-2 through the veth-t100 interface. +# +# The ping response follows the same processing but this time the roles of rt-1 +# and rt-2 are swapped. +# +# And when net.netfilter.nf_hooks_lwtunnel is set to 1 in rt-1 or rt-2, and a +# rpfilter iptables rule is added, SRv6 packets will go through netfilter PREROUTING +# hooks. +# +# +# +-------------------+ +-------------------+ +# | | | | +# | hs-1 netns | | hs-2 netns | +# | | | | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | 10.0.0.1/24 | | | | 10.0.0.2/24 | | +# | +-------------+ | | +-------------+ | +# | . | | . | +# +-------------------+ +-------------------+ +# . . +# . . +# . . +# +-----------------------------------+ +-----------------------------------+ +# | . | | .
| +# | +---------------+ | | +---------------- | +# | | veth-t100 | | | | veth-t100 | | +# | | 10.0.0.11/24 | +----------+ | | +----------+ | 10.0.0.22/24 | | +# | +-------+-------+ | route | | | | route | +-------+-------- | +# | | table | | | | table | | +# | +----------+ | | +----------+ | +# | +--------------+ | | +--------------+ | +# | | veth0 | | | | veth0 | | +# | | 2001:11::1/64 |.|...|.| 2001:11::2/64 | | +# | +--------------+ | | +--------------+ | +# | | | | +# | rt-1 netns | | rt-2 netns | +# | | | | +# +-----------------------------------+ +-----------------------------------+ +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# | Network configuration | +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# rt-1: localsid table +# +----------------------------------------------------------------+ +# |SID |Action | +# +----------------------------------------------------------------+ +# |fc00:21:100::6004|apply SRv6 End.DX4 nh4 10.0.0.1 dev veth-t100 | +# +----------------------------------------------------------------+ +# +# rt-1: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |10.0.0.2 |apply seg6 encap segs fc00:12:100::6004| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward to dev veth_t100 | +# +---------------------------------------------------+ +# +# +# rt-2: localsid table +# +---------------------------------------------------------------+ +# |SID |Action | +# +---------------------------------------------------------------+ +# |fc00:12:100::6004|apply SRv6 End.DX4 nh4 10.0.0.2 dev veth-t100| +# +---------------------------------------------------------------+ +# +# rt-2: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |10.0.0.1 |apply seg6 encap segs fc00:21:100::6004| +# +---------------------------------------------------+ +# |10.0.0.0/24|forward 
to dev veth_t100 | +# +---------------------------------------------------+ +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +readonly IPv6_RT_NETWORK=2001:11 +readonly IPv4_HS_NETWORK=10.0.0 +readonly SID_LOCATOR=fc00 + +PING_TIMEOUT_SEC=4 + +ret=0 + +PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + nsuccess=$((nsuccess+1)) + printf "\n TEST: %-60s [ OK ]\n" "${msg}" + else + ret=1 + nfail=$((nfail+1)) + printf "\n TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +print_log_test_results() +{ + if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} + fi +} + +log_section() +{ + echo + echo "################################################################################" + echo "TEST SECTION: $*" + echo "################################################################################" +} + +cleanup() +{ + ip link del veth-rt-1 2>/dev/null || true + ip link del veth-rt-2 2>/dev/null || true + + # destroy routers rt-* and hosts hs-* + for ns in $(ip netns show | grep -E 'rt-*|hs-*'); do + ip netns del ${ns} || true + done +} + +# Setup the basic networking for the routers +setup_rt_networking() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns add ${nsname} + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip link set veth-rt-${rt} netns ${nsname} + ip -netns ${nsname} link set veth-rt-${rt} name veth0 + + ip -netns ${nsname} addr add ${IPv6_RT_NETWORK}::${rt}/64 dev veth0 nodad + ip -netns ${nsname} link set veth0 up + ip -netns ${nsname} link set lo up + + ip netns exec ${nsname} sysctl -wq net.ipv4.ip_forward=1 + ip netns exec ${nsname} sysctl -wq 
net.ipv6.conf.all.forwarding=1 +} + +setup_rt_netfilter() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns exec ${nsname} sysctl -wq net.netfilter.nf_hooks_lwtunnel=1 + ip netns exec ${nsname} iptables -t raw -A PREROUTING -m rpfilter --invert -j DROP +} + +setup_hs() +{ + local hs=$1 + local rt=$2 + local tid=$3 + local hsname=hs-${hs} + local rtname=rt-${rt} + local rtveth=veth-t${tid} + + # set the networking for the host + ip netns add ${hsname} + + ip -netns ${hsname} link add veth0 type veth peer name ${rtveth} + ip -netns ${hsname} link set ${rtveth} netns ${rtname} + ip -netns ${hsname} addr add ${IPv4_HS_NETWORK}.${hs}/24 dev veth0 + ip -netns ${hsname} link set veth0 up + ip -netns ${hsname} link set lo up + + ip -netns ${rtname} addr add ${IPv4_HS_NETWORK}.${rt}${hs}/24 dev ${rtveth} + ip -netns ${rtname} link set ${rtveth} up + + ip netns exec ${rtname} sysctl -wq net.ipv4.conf.${rtveth}.proxy_arp=1 +} + +setup_vpn_config() +{ + local hssrc=$1 + local rtsrc=$2 + local hsdst=$3 + local rtdst=$4 + local tid=$5 + + local hssrc_name=hs-t${tid}-${hssrc} + local hsdst_name=hs-t${tid}-${hsdst} + local rtsrc_name=rt-${rtsrc} + local rtdst_name=rt-${rtdst} + local vpn_sid=${SID_LOCATOR}:${hssrc}${hsdst}:${tid}::6004 + + # set the encap route for encapsulating packets which arrive from the + # host hssrc and destined to the access router rtsrc. + ip -netns ${rtsrc_name} -4 route add ${IPv4_HS_NETWORK}.${hsdst}/32 \ + encap seg6 mode encap segs ${vpn_sid} dev veth0 + ip -netns ${rtsrc_name} -6 route add ${vpn_sid}/128 \ + via 2001:11::${rtdst} dev veth0 + + # set the decap route for decapsulating packets which arrive from + # the rtdst router and destined to the hsdst host. 
+ ip -netns ${rtdst_name} -6 route add ${vpn_sid}/128 \ + encap seg6local action End.DX4 nh4 ${IPv4_HS_NETWORK}.${hsdst} dev veth-t${tid} +} + +setup() +{ + ip link add veth-rt-1 type veth peer name veth-rt-2 + # setup the networking for router rt-1 and router rt-2 + setup_rt_networking 1 + setup_rt_networking 2 + + # setup two hosts for the tenant 100. + # - host hs-1 is directly connected to the router rt-1; + # - host hs-2 is directly connected to the router rt-2. + setup_hs 1 1 100 + setup_hs 2 2 100 + + # setup the IPv4 L3 VPN which connects the host hs-1 and host hs-2. + setup_vpn_config 1 1 2 2 100 #args: src_host src_router dst_host dst_router tenant + setup_vpn_config 2 2 1 1 100 +} + +check_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + ip netns exec hs-${hssrc} ping -c 1 -W ${PING_TIMEOUT_SEC} \ + ${IPv4_HS_NETWORK}.${hsdst} >/dev/null 2>&1 +} + +check_and_log_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + check_hs_connectivity ${hssrc} ${hsdst} ${tid} + log_test $? 0 "Hosts connectivity: hs-${hssrc} -> hs-${hsdst} (tenant ${tid})" +} + +host_tests() +{ + log_section "SRv6 VPN connectivity test among hosts in the same tenant" + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +router_netfilter_tests() +{ + log_section "SRv6 VPN connectivity test with netfilter enabled in routers" + setup_rt_netfilter 1 + setup_rt_netfilter 2 + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +if [ ! 
-x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +cleanup &>/dev/null + +setup + +host_tests +router_netfilter_tests + +print_log_test_results + +cleanup &>/dev/null + +exit ${ret} From 221200ffeb065c6bbd196760c168b42305961655 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 13 Jun 2024 17:42:49 +0800 Subject: [PATCH 184/272] selftests: add selftest for the SRv6 End.DX6 behavior with netfilter this selftest is designed for evaluating the SRv6 End.DX6 behavior used with netfilter(rpfilter), in this example, for implementing IPv6 L3 VPN use cases. Signed-off-by: Jianguo Wu Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config | 1 + .../net/srv6_end_dx6_netfilter_test.sh | 340 ++++++++++++++++++ 3 files changed, 342 insertions(+) create mode 100755 tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 7a5f7dd320deb..d9393569d03a4 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -44,6 +44,7 @@ TEST_PROGS += srv6_end_next_csid_l3vpn_test.sh TEST_PROGS += srv6_end_x_next_csid_l3vpn_test.sh TEST_PROGS += srv6_end_flavors_test.sh TEST_PROGS += srv6_end_dx4_netfilter_test.sh +TEST_PROGS += srv6_end_dx6_netfilter_test.sh TEST_PROGS += vrf_strict_mode_test.sh TEST_PROGS += arp_ndisc_evict_nocarrier.sh TEST_PROGS += ndisc_unsolicited_na_test.sh diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index c2766e558f925..d4891f7a2bfae 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -102,3 +102,4 @@ CONFIG_CRYPTO_ARIA=y CONFIG_XFRM_INTERFACE=m CONFIG_XFRM_USER=m CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RPFILTER=m diff --git a/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh 
b/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh new file mode 100755 index 0000000000000..9e69a2ed5bc34 --- /dev/null +++ b/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh @@ -0,0 +1,340 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# author: Jianguo Wu +# +# Mostly copied from tools/testing/selftests/net/srv6_end_dt6_l3vpn_test.sh. +# +# This script is designed for testing the support of netfilter hooks for +# SRv6 End.DX6 behavior. +# +# Hereafter a network diagram is shown, where one tenant (named 100) offers +# IPv6 L3 VPN services allowing hosts to communicate with each other across +# an IPv6 network. +# +# Routers rt-1 and rt-2 implement IPv6 L3 VPN services leveraging the SRv6 +# architecture. The key components for such VPNs are: a) SRv6 Encap behavior, +# b) SRv6 End.DX6 behavior. +# +# To explain how an IPv6 L3 VPN based on SRv6 works, let us briefly consider an +# example where, within the same domain of tenant 100, the host hs-1 pings +# the host hs-2. +# +# First of all, L2 reachability of the host hs-2 is taken into account by +# the router rt-1 which acts as an NDP proxy. +# +# When the host hs-1 sends an IPv6 packet destined to hs-2, the router rt-1 +# receives the packet on the internal veth-t100 interface, rt-1 contains the +# SRv6 Encap route for encapsulating the IPv6 packet in an IPv6 plus the Segment +# Routing Header (SRH) packet. This packet is sent through the (IPv6) core +# network up to the router rt-2 that receives it on veth0 interface. +# +# The rt-2 router uses the 'localsid' routing table to process incoming +# IPv6+SRH packets which belong to the VPN of the tenant 100. For each of these +# packets, the SRv6 End.DX6 behavior removes the outer IPv6+SRH headers and +# routes the packet to the specified nexthop. Afterwards, the packet is sent to +# the host hs-2 through the veth-t100 interface. +# +# The ping response follows the same processing but this time the roles of rt-1 +# and rt-2 are swapped.
+# +# And when net.netfilter.nf_hooks_lwtunnel is set to 1 in rt-1 or rt-2, and a +# rpfilter iptables rule is added, SRv6 packets will go through netfilter PREROUTING +# hooks. +# +# +# +-------------------+ +-------------------+ +# | | | | +# | hs-1 netns | | hs-2 netns | +# | | | | +# | +-------------+ | | +-------------+ | +# | | veth0 | | | | veth0 | | +# | | cafe::1/64 | | | | cafe::2/64 | | +# | +-------------+ | | +-------------+ | +# | . | | . | +# +-------------------+ +-------------------+ +# . . +# . . +# . . +# +-----------------------------------+ +-----------------------------------+ +# | . | | . | +# | +---------------+ | | +---------------- | +# | | veth-t100 | | | | veth-t100 | | +# | | cafe::11/64 | +----------+ | | +----------+ | cafe::22/64 | | +# | +-------+-------+ | route | | | | route | +-------+-------- | +# | | table | | | | table | | +# | +----------+ | | +----------+ | +# | +--------------+ | | +--------------+ | +# | | veth0 | | | | veth0 | | +# | | 2001:11::1/64 |.|...|.| 2001:11::2/64 | | +# | +--------------+ | | +--------------+ | +# | | | | +# | rt-1 netns | | rt-2 netns | +# | | | | +# +-----------------------------------+ +-----------------------------------+ +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# | Network configuration | +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# rt-1: localsid table +# +----------------------------------------------------------------+ +# |SID |Action | +# +----------------------------------------------------------------+ +# |fc00:21:100::6004|apply SRv6 End.DX6 nh6 cafe::1 dev veth-t100 | +# +----------------------------------------------------------------+ +# +# rt-1: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::2 |apply seg6 encap segs fc00:12:100::6004| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth_t100 | +# +---------------------------------------------------+ +# 
+# +# rt-2: localsid table +# +---------------------------------------------------------------+ +# |SID |Action | +# +---------------------------------------------------------------+ +# |fc00:12:100::6004|apply SRv6 End.DX6 nh6 cafe::2 dev veth-t100 | +# +---------------------------------------------------------------+ +# +# rt-2: route table +# +---------------------------------------------------+ +# |host |Action | +# +---------------------------------------------------+ +# |cafe::1 |apply seg6 encap segs fc00:21:100::6004| +# +---------------------------------------------------+ +# |cafe::/64 |forward to dev veth_t100 | +# +---------------------------------------------------+ +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +readonly IPv6_RT_NETWORK=2001:11 +readonly IPv6_HS_NETWORK=cafe +readonly SID_LOCATOR=fc00 + +PING_TIMEOUT_SEC=4 + +ret=0 + +PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + nsuccess=$((nsuccess+1)) + printf "\n TEST: %-60s [ OK ]\n" "${msg}" + else + ret=1 + nfail=$((nfail+1)) + printf "\n TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +print_log_test_results() +{ + if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} + fi +} + +log_section() +{ + echo + echo "################################################################################" + echo "TEST SECTION: $*" + echo "################################################################################" +} + +cleanup() +{ + ip link del veth-rt-1 2>/dev/null || true + ip link del veth-rt-2 2>/dev/null || true + + # destroy routers rt-* and hosts hs-* + for ns in $(ip netns show | grep -E 'rt-*|hs-*'); do + ip netns del ${ns} || true + done +} + +# Setup the basic networking for the 
routers +setup_rt_networking() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns add ${nsname} + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip link set veth-rt-${rt} netns ${nsname} + ip -netns ${nsname} link set veth-rt-${rt} name veth0 + + ip -netns ${nsname} addr add ${IPv6_RT_NETWORK}::${rt}/64 dev veth0 nodad + ip -netns ${nsname} link set veth0 up + ip -netns ${nsname} link set lo up + + ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.forwarding=1 +} + +setup_rt_netfilter() +{ + local rt=$1 + local nsname=rt-${rt} + + ip netns exec ${nsname} sysctl -wq net.netfilter.nf_hooks_lwtunnel=1 + ip netns exec ${nsname} ip6tables -t raw -A PREROUTING -m rpfilter --invert -j DROP +} + +setup_hs() +{ + local hs=$1 + local rt=$2 + local tid=$3 + local hsname=hs-${hs} + local rtname=rt-${rt} + local rtveth=veth-t${tid} + + # set the networking for the host + ip netns add ${hsname} + + ip -netns ${hsname} link add veth0 type veth peer name ${rtveth} + ip -netns ${hsname} link set ${rtveth} netns ${rtname} + ip -netns ${hsname} addr add ${IPv6_HS_NETWORK}::${hs}/64 dev veth0 nodad + ip -netns ${hsname} link set veth0 up + ip -netns ${hsname} link set lo up + + ip -netns ${rtname} addr add ${IPv6_HS_NETWORK}::${rt}${hs}/64 dev ${rtveth} + ip -netns ${rtname} link set ${rtveth} up + + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.all.accept_dad=0 + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.default.accept_dad=0 + + ip netns exec ${rtname} sysctl -wq net.ipv6.conf.${rtveth}.proxy_ndp=1 +} + +setup_vpn_config() +{ + local hssrc=$1 + local rtsrc=$2 + local hsdst=$3 + local rtdst=$4 + local tid=$5 + + local hssrc_name=hs-t${tid}-${hssrc} + local hsdst_name=hs-t${tid}-${hsdst} + local rtsrc_name=rt-${rtsrc} + local rtdst_name=rt-${rtdst} + local rtveth=veth-t${tid} + local vpn_sid=${SID_LOCATOR}:${hssrc}${hsdst}:${tid}::6004 + + ip -netns ${rtsrc_name} -6 neigh 
add proxy ${IPv6_HS_NETWORK}::${hsdst} dev ${rtveth} + + # set the encap route for encapsulating packets which arrive from the + # host hssrc and destined to the access router rtsrc. + ip -netns ${rtsrc_name} -6 route add ${IPv6_HS_NETWORK}::${hsdst}/128 \ + encap seg6 mode encap segs ${vpn_sid} dev veth0 + ip -netns ${rtsrc_name} -6 route add ${vpn_sid}/128 \ + via 2001:11::${rtdst} dev veth0 + + # set the decap route for decapsulating packets which arrive from + # the rtdst router and destined to the hsdst host. + ip -netns ${rtdst_name} -6 route add ${vpn_sid}/128 \ + encap seg6local action End.DX6 nh6 ${IPv6_HS_NETWORK}::${hsdst} dev veth-t${tid} +} + +setup() +{ + ip link add veth-rt-1 type veth peer name veth-rt-2 + # setup the networking for router rt-1 and router rt-2 + setup_rt_networking 1 + setup_rt_networking 2 + + # setup two hosts for the tenant 100. + # - host hs-1 is directly connected to the router rt-1; + # - host hs-2 is directly connected to the router rt-2. + setup_hs 1 1 100 + setup_hs 2 2 100 + + # setup the IPv6 L3 VPN which connects the host hs-1 and host hs-2. + setup_vpn_config 1 1 2 2 100 #args: src_host src_router dst_host dst_router tenant + setup_vpn_config 2 2 1 1 100 +} + +check_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + ip netns exec hs-${hssrc} ping -6 -c 1 -W ${PING_TIMEOUT_SEC} \ + ${IPv6_HS_NETWORK}::${hsdst} >/dev/null 2>&1 +} + +check_and_log_hs_connectivity() +{ + local hssrc=$1 + local hsdst=$2 + local tid=$3 + + check_hs_connectivity ${hssrc} ${hsdst} ${tid} + log_test $?
0 "Hosts connectivity: hs-${hssrc} -> hs-${hsdst} (tenant ${tid})" +} + +host_tests() +{ + log_section "SRv6 VPN connectivity test among hosts in the same tenant" + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +router_netfilter_tests() +{ + log_section "SRv6 VPN connectivity test with netfilter enabled in routers" + setup_rt_netfilter 1 + setup_rt_netfilter 2 + + check_and_log_hs_connectivity 1 2 100 + check_and_log_hs_connectivity 2 1 100 +} + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +cleanup &>/dev/null + +setup + +host_tests +router_netfilter_tests + +print_log_test_results + +cleanup &>/dev/null + +exit ${ret} From e2654a4453ba3dac9baacf9980d841d84e15b869 Mon Sep 17 00:00:00 2001 From: Roman Li Date: Tue, 7 May 2024 16:26:08 -0400 Subject: [PATCH 185/272] drm/amd/display: Remove redundant idle optimization check [Why] Disable idle optimization for each atomic commit is unnecessary, and can lead to a potential race condition. 
[How] Remove idle optimization check from amdgpu_dm_atomic_commit_tail() Fixes: 196107eb1e15 ("drm/amd/display: Add IPS checks before dcn register access") Cc: stable@vger.kernel.org Reviewed-by: Hamza Mahfooz Acked-by: Roman Li Signed-off-by: Roman Li Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index f1d67c6f4b98f..e426adf95d7de 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -9169,9 +9169,6 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) trace_amdgpu_dm_atomic_commit_tail_begin(state); - if (dm->dc->caps.ips_support && dm->dc->idle_optimizations_allowed) - dc_allow_idle_optimizations(dm->dc, false); - drm_atomic_helper_update_legacy_modeset_state(dev, state); drm_dp_mst_atomic_wait_for_dependencies(state); From 84801d4f1e4fbd2c44dddecaec9099bdff100a42 Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Thu, 23 May 2024 07:48:19 -0400 Subject: [PATCH 186/272] drm/amdgpu: fix locking scope when flushing tlb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Which method is used to flush tlb does not depend on whether a reset is in progress or not. We should skip flush altogether if the GPU will get reset. So put both path under reset_domain read lock. 
Signed-off-by: Yunxiang Li Reviewed-by: Christian König Signed-off-by: Alex Deucher CC: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 66 +++++++++++++------------ 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index be4629cdac049..08b9dfb653355 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -684,12 +684,17 @@ int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid, struct amdgpu_ring *ring = &adev->gfx.kiq[inst].ring; struct amdgpu_kiq *kiq = &adev->gfx.kiq[inst]; unsigned int ndw; - signed long r; + int r; uint32_t seq; - if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready || - !down_read_trylock(&adev->reset_domain->sem)) { + /* + * A GPU reset should flush all TLBs anyway, so no need to do + * this while one is ongoing. + */ + if (!down_read_trylock(&adev->reset_domain->sem)) + return 0; + if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready) { if (adev->gmc.flush_tlb_needs_extra_type_2) adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid, 2, all_hub, @@ -703,43 +708,40 @@ int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid, adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub, inst); - return 0; - } + r = 0; + } else { + /* 2 dwords flush + 8 dwords fence */ + ndw = kiq->pmf->invalidate_tlbs_size + 8; - /* 2 dwords flush + 8 dwords fence */ - ndw = kiq->pmf->invalidate_tlbs_size + 8; + if (adev->gmc.flush_tlb_needs_extra_type_2) + ndw += kiq->pmf->invalidate_tlbs_size; - if (adev->gmc.flush_tlb_needs_extra_type_2) - ndw += kiq->pmf->invalidate_tlbs_size; + if (adev->gmc.flush_tlb_needs_extra_type_0) + ndw += kiq->pmf->invalidate_tlbs_size; - if (adev->gmc.flush_tlb_needs_extra_type_0) - ndw += kiq->pmf->invalidate_tlbs_size; + spin_lock(&adev->gfx.kiq[inst].ring_lock); + amdgpu_ring_alloc(ring, ndw); + if 
(adev->gmc.flush_tlb_needs_extra_type_2) + kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub); - spin_lock(&adev->gfx.kiq[inst].ring_lock); - amdgpu_ring_alloc(ring, ndw); - if (adev->gmc.flush_tlb_needs_extra_type_2) - kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub); + if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0) + kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub); - if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0) - kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub); + kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub); + r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT); + if (r) { + amdgpu_ring_undo(ring); + spin_unlock(&adev->gfx.kiq[inst].ring_lock); + goto error_unlock_reset; + } - kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub); - r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT); - if (r) { - amdgpu_ring_undo(ring); + amdgpu_ring_commit(ring); spin_unlock(&adev->gfx.kiq[inst].ring_lock); - goto error_unlock_reset; - } - - amdgpu_ring_commit(ring); - spin_unlock(&adev->gfx.kiq[inst].ring_lock); - r = amdgpu_fence_wait_polling(ring, seq, usec_timeout); - if (r < 1) { - dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r); - r = -ETIME; - goto error_unlock_reset; + if (amdgpu_fence_wait_polling(ring, seq, usec_timeout) < 1) { + dev_err(adev->dev, "timeout waiting for kiq fence\n"); + r = -ETIME; + } } - r = 0; error_unlock_reset: up_read(&adev->reset_domain->sem); From 56342da3d8cc15efe9df7f29985ba8d256bdc258 Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Mon, 3 Jun 2024 10:16:45 -0400 Subject: [PATCH 187/272] drm/amd/display: prevent register access while in IPS We can't read/write to DCN registers while in IPS. Since, that can cause the system to hang. So, before proceeding with the access in that scenario, force the system out of IPS. 
Cc: stable@vger.kernel.org # 6.6+ Reviewed-by: Roman Li Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index e426adf95d7de..e9ac20bed0f2b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -11437,6 +11437,12 @@ void amdgpu_dm_trigger_timing_sync(struct drm_device *dev) mutex_unlock(&adev->dm.dc_lock); } +static inline void amdgpu_dm_exit_ips_for_hw_access(struct dc *dc) +{ + if (dc->ctx->dmub_srv && !dc->ctx->dmub_srv->idle_exit_counter) + dc_exit_ips_for_hw_access(dc); +} + void dm_write_reg_func(const struct dc_context *ctx, uint32_t address, u32 value, const char *func_name) { @@ -11447,6 +11453,8 @@ void dm_write_reg_func(const struct dc_context *ctx, uint32_t address, return; } #endif + + amdgpu_dm_exit_ips_for_hw_access(ctx->dc); cgs_write_register(ctx->cgs_device, address, value); trace_amdgpu_dc_wreg(&ctx->perf_trace->write_count, address, value); } @@ -11470,6 +11478,8 @@ uint32_t dm_read_reg_func(const struct dc_context *ctx, uint32_t address, return 0; } + amdgpu_dm_exit_ips_for_hw_access(ctx->dc); + value = cgs_read_register(ctx->cgs_device, address); trace_amdgpu_dc_rreg(&ctx->perf_trace->read_count, address, value); From 49c9ffabde555c841392858d8b9e6cf58998a50c Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Wed, 5 Jun 2024 09:30:50 -0400 Subject: [PATCH 188/272] drm/amdgpu: Indicate CU havest info to CP To achieve full occupancy CP hardware needs to know if CUs in SE are symmetrically or asymmetrically harvested v2: Reset is_symmetric_cus for each loop Signed-off-by: Harish Kasiviswanathan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 
deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 7b16e8cca86ac..f5b9f443cfdd7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -4195,9 +4195,10 @@ static u32 gfx_v9_4_3_get_cu_active_bitmap(struct amdgpu_device *adev, int xcc_i static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev, struct amdgpu_cu_info *cu_info) { - int i, j, k, counter, xcc_id, active_cu_number = 0; - u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0; + int i, j, k, prev_counter, counter, xcc_id, active_cu_number = 0; + u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0, tmp; unsigned disable_masks[4 * 4]; + bool is_symmetric_cus; if (!adev || !cu_info) return -EINVAL; @@ -4215,6 +4216,7 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev, mutex_lock(&adev->grbm_idx_mutex); for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) { + is_symmetric_cus = true; for (i = 0; i < adev->gfx.config.max_shader_engines; i++) { for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) { mask = 1; @@ -4242,6 +4244,15 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev, ao_cu_mask |= (ao_bitmap << (i * 16 + j * 8)); cu_info->ao_cu_bitmap[i][j] = ao_bitmap; } + if (i && is_symmetric_cus && prev_counter != counter) + is_symmetric_cus = false; + prev_counter = counter; + } + if (is_symmetric_cus) { + tmp = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_DEBUG); + tmp = REG_SET_FIELD(tmp, CP_CPC_DEBUG, CPC_HARVESTING_RELAUNCH_DISABLE, 1); + tmp = REG_SET_FIELD(tmp, CP_CPC_DEBUG, CPC_HARVESTING_DISPATCH_DISABLE, 1); + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_CPC_DEBUG, tmp); } gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, xcc_id); From 8bd82363e2ee2eb3a9a8ea1fa94ebe1900d05a71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 5 Jun 2024 13:27:20 +0200 Subject: [PATCH 189/272] drm/amdgpu: revert "take runtime pm reference 
when we attach a buffer" v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit b8c415e3bf98 ("drm/amdgpu: take runtime pm reference when we attach a buffer") and commit 425285d39afd ("drm/amdgpu: add amdgpu runpm usage trace for separate funcs"). Taking a runtime pm reference for DMA-buf is actually completely unnecessary and even dangerous. The problem is that calling pm_runtime_get_sync() from the DMA-buf callbacks is illegal because we have the reservation locked here which is also taken during resume. So this would deadlock. When the buffer is in GTT it is still accessible even when the GPU is powered down and when it is in VRAM the buffer gets migrated to GTT before powering down. The only use case which would make it mandatory to keep the runtime pm reference would be if we pin the buffer into VRAM, and that's not something we currently do. v2: improve the commit message Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher CC: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 34 --------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 15 --------- 3 files changed, 51 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 055ba2ea4c126..662d0f28f3587 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -41,8 +41,6 @@ #include #include #include -#include -#include "amdgpu_trace.h" /** * amdgpu_dma_buf_attach - &dma_buf_ops.attach implementation @@ -58,42 +56,11 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf, struct drm_gem_object *obj = dmabuf->priv; struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); - int r; if (pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0) attach->peer2peer = false; - r = 
pm_runtime_get_sync(adev_to_drm(adev)->dev); - trace_amdgpu_runpm_reference_dumps(1, __func__); - if (r < 0) - goto out; - return 0; - -out: - pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); - trace_amdgpu_runpm_reference_dumps(0, __func__); - return r; -} - -/** - * amdgpu_dma_buf_detach - &dma_buf_ops.detach implementation - * - * @dmabuf: DMA-buf where we remove the attachment from - * @attach: the attachment to remove - * - * Called when an attachment is removed from the DMA-buf. - */ -static void amdgpu_dma_buf_detach(struct dma_buf *dmabuf, - struct dma_buf_attachment *attach) -{ - struct drm_gem_object *obj = dmabuf->priv; - struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); - - pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); - pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); - trace_amdgpu_runpm_reference_dumps(0, __func__); } /** @@ -267,7 +234,6 @@ static int amdgpu_dma_buf_begin_cpu_access(struct dma_buf *dma_buf, const struct dma_buf_ops amdgpu_dmabuf_ops = { .attach = amdgpu_dma_buf_attach, - .detach = amdgpu_dma_buf_detach, .pin = amdgpu_dma_buf_pin, .unpin = amdgpu_dma_buf_unpin, .map_dma_buf = amdgpu_dma_buf_map, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 10832b4704484..bc3ac73b6b8d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -181,7 +181,6 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, seq, flags | AMDGPU_FENCE_FLAG_INT); pm_runtime_get_noresume(adev_to_drm(adev)->dev); - trace_amdgpu_runpm_reference_dumps(1, __func__); ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask]; if (unlikely(rcu_dereference_protected(*ptr, 1))) { struct dma_fence *old; @@ -309,7 +308,6 @@ bool amdgpu_fence_process(struct amdgpu_ring *ring) dma_fence_put(fence); 
pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); - trace_amdgpu_runpm_reference_dumps(0, __func__); } while (last_seq != seq); return true; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h index 7aafeb763e5dd..383fce40d4dd7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h @@ -554,21 +554,6 @@ TRACE_EVENT(amdgpu_reset_reg_dumps, __entry->value) ); -TRACE_EVENT(amdgpu_runpm_reference_dumps, - TP_PROTO(uint32_t index, const char *func), - TP_ARGS(index, func), - TP_STRUCT__entry( - __field(uint32_t, index) - __string(func, func) - ), - TP_fast_assign( - __entry->index = index; - __assign_str(func); - ), - TP_printk("amdgpu runpm reference dump 0x%x: 0x%s\n", - __entry->index, - __get_str(func)) -); #undef AMDGPU_JOB_GET_TIMELINE_NAME #endif From c60e20f13c27662de36cd5538d6299760780db52 Mon Sep 17 00:00:00 2001 From: Daniel Miess Date: Tue, 28 May 2024 16:17:17 -0400 Subject: [PATCH 190/272] drm/amd/display: Change dram_clock_latency to 34us for dcn351 [Why] Intermittent underflow observed when using 4k144 display on dcn351 [How] Update dram_clock_change_latency_us from 11.72us to 34us Reviewed-by: Nicholas Kazlauskas Acked-by: Zaeem Mohamed Signed-off-by: Daniel Miess Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dml/dcn351/dcn351_fpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn351/dcn351_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn351/dcn351_fpu.c index e4f333d4fb54f..a201dbb743d79 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn351/dcn351_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn351/dcn351_fpu.c @@ -215,7 +215,7 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_51_soc = { .urgent_latency_pixel_data_only_us = 4.0, .urgent_latency_pixel_mixed_with_vm_data_us = 4.0, .urgent_latency_vm_data_only_us = 
4.0, - .dram_clock_change_latency_us = 11.72, + .dram_clock_change_latency_us = 34, .urgent_out_of_order_return_per_channel_pixel_only_bytes = 4096, .urgent_out_of_order_return_per_channel_pixel_and_vm_bytes = 4096, .urgent_out_of_order_return_per_channel_vm_only_bytes = 4096, From 6071607bfefefc50a3907c0ba88878846960d29a Mon Sep 17 00:00:00 2001 From: Paul Hsieh Date: Tue, 28 May 2024 14:36:00 +0800 Subject: [PATCH 191/272] drm/amd/display: change dram_clock_latency to 34us for dcn35 [Why & How] Current DRAM setting would cause underflow on customer platform. Modify dram_clock_change_latency_us from 11.72 to 34.0 us as per recommendation from HW team Reviewed-by: Nicholas Kazlauskas Acked-by: Zaeem Mohamed Signed-off-by: Paul Hsieh Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c index 60f251cf973b1..beed7adbbd43e 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c @@ -177,7 +177,7 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_5_soc = { .urgent_latency_pixel_data_only_us = 4.0, .urgent_latency_pixel_mixed_with_vm_data_us = 4.0, .urgent_latency_vm_data_only_us = 4.0, - .dram_clock_change_latency_us = 11.72, + .dram_clock_change_latency_us = 34.0, .urgent_out_of_order_return_per_channel_pixel_only_bytes = 4096, .urgent_out_of_order_return_per_channel_pixel_and_vm_bytes = 4096, .urgent_out_of_order_return_per_channel_vm_only_bytes = 4096, From c03d770c0b014a3007a5874bf6b3c3e64d32aaac Mon Sep 17 00:00:00 2001 From: Michael Strauss Date: Tue, 7 May 2024 12:03:15 -0400 Subject: [PATCH 192/272] drm/amd/display: Attempt to avoid empty TUs when endpoint is DPIA [WHY] Empty SST TUs are illegal to transmit over a USB4 DP tunnel. 
Current policy is to configure stream encoder to pack 2 pixels per pclk even when ODM combine is not in use, allowing seamless dynamic ODM reconfiguration. However, in extreme edge cases where average pixel count per TU is less than 2, this can lead to unexpected empty TU generation during compliance testing. For example, VIC 1 with a 1xHBR3 link configuration will average 1.98 pix/TU. [HOW] Calculate average pixel count per TU, and block 2 pixels per clock if endpoint is a DPIA tunnel and pixel clock is low enough that we will never require 2:1 ODM combine. Cc: stable@vger.kernel.org # 6.6+ Reviewed-by: Wenjing Liu Acked-by: Hamza Mahfooz Signed-off-by: Michael Strauss Signed-off-by: Alex Deucher --- .../amd/display/dc/hwss/dcn35/dcn35_hwseq.c | 72 +++++++++++++++++++ .../amd/display/dc/hwss/dcn35/dcn35_hwseq.h | 2 + .../amd/display/dc/hwss/dcn35/dcn35_init.c | 2 +- 3 files changed, 75 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index 5295f52e4fc84..dcced89c07b38 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -1439,3 +1439,75 @@ void dcn35_set_long_vblank(struct pipe_ctx **pipe_ctx, } } } + +static bool should_avoid_empty_tu(struct pipe_ctx *pipe_ctx) +{ + /* Calculate average pixel count per TU, return false if under ~2.00 to + * avoid empty TUs. This is only required for DPIA tunneling as empty TUs + * are legal to generate for native DP links. Assume TU size 64 as there + * is currently no scenario where it's reprogrammed from HW default. + * MTPs have no such limitation, so this does not affect MST use cases. 
+ */ + unsigned int pix_clk_mhz; + unsigned int symclk_mhz; + unsigned int avg_pix_per_tu_x1000; + unsigned int tu_size_bytes = 64; + struct dc_crtc_timing *timing = &pipe_ctx->stream->timing; + struct dc_link_settings *link_settings = &pipe_ctx->link_config.dp_link_settings; + const struct dc *dc = pipe_ctx->stream->link->dc; + + if (pipe_ctx->stream->link->ep_type != DISPLAY_ENDPOINT_USB4_DPIA) + return false; + + // Not necessary for MST configurations + if (pipe_ctx->stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST) + return false; + + pix_clk_mhz = timing->pix_clk_100hz / 10000; + + // If this is true, can't block due to dynamic ODM + if (pix_clk_mhz > dc->clk_mgr->bw_params->clk_table.entries[0].dispclk_mhz) + return false; + + switch (link_settings->link_rate) { + case LINK_RATE_LOW: + symclk_mhz = 162; + break; + case LINK_RATE_HIGH: + symclk_mhz = 270; + break; + case LINK_RATE_HIGH2: + symclk_mhz = 540; + break; + case LINK_RATE_HIGH3: + symclk_mhz = 810; + break; + default: + // We shouldn't be tunneling any other rates, something is wrong + ASSERT(0); + return false; + } + + avg_pix_per_tu_x1000 = (1000 * pix_clk_mhz * tu_size_bytes) + / (symclk_mhz * link_settings->lane_count); + + // Add small empirically-decided margin to account for potential jitter + return (avg_pix_per_tu_x1000 < 2020); +} + +bool dcn35_is_dp_dig_pixel_rate_div_policy(struct pipe_ctx *pipe_ctx) +{ + struct dc *dc = pipe_ctx->stream->ctx->dc; + + if (!is_h_timing_divisible_by_2(pipe_ctx->stream)) + return false; + + if (should_avoid_empty_tu(pipe_ctx)) + return false; + + if (dc_is_dp_signal(pipe_ctx->stream->signal) && !dc->link_srv->dp_is_128b_132b_signal(pipe_ctx) && + dc->debug.enable_dp_dig_pixel_rate_div_policy) + return true; + + return false; +} diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h index a731c8880d60a..f0ea7d1511ae6 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h +++ 
b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h @@ -95,4 +95,6 @@ void dcn35_set_static_screen_control(struct pipe_ctx **pipe_ctx, void dcn35_set_long_vblank(struct pipe_ctx **pipe_ctx, int num_pipes, uint32_t v_total_min, uint32_t v_total_max); +bool dcn35_is_dp_dig_pixel_rate_div_policy(struct pipe_ctx *pipe_ctx); + #endif /* __DC_HWSS_DCN35_H__ */ diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c index df3bf77f3fb46..199781233fd5f 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c @@ -158,7 +158,7 @@ static const struct hwseq_private_funcs dcn35_private_funcs = { .setup_hpo_hw_control = dcn35_setup_hpo_hw_control, .calculate_dccg_k1_k2_values = dcn32_calculate_dccg_k1_k2_values, .set_pixels_per_cycle = dcn32_set_pixels_per_cycle, - .is_dp_dig_pixel_rate_div_policy = dcn32_is_dp_dig_pixel_rate_div_policy, + .is_dp_dig_pixel_rate_div_policy = dcn35_is_dp_dig_pixel_rate_div_policy, .dsc_pg_control = dcn35_dsc_pg_control, .dsc_pg_status = dcn32_dsc_pg_status, .enable_plane = dcn35_enable_plane, From 301daa346f0e34a87fb6c1e4a05db2aa0a66b573 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 14 Jun 2024 12:54:52 -0700 Subject: [PATCH 193/272] drm/amd/display: Disable CONFIG_DRM_AMD_DC_FP for RISC-V with clang Commit 77acc6b55ae4 ("riscv: add support for kernel-mode FPU") and commit a28e4b672f04 ("drm/amd/display: use ARCH_HAS_KERNEL_FPU_SUPPORT") enabled support for CONFIG_DRM_AMD_DC_FP with RISC-V. 
Unfortunately, this exposed -Wframe-larger-than warnings (which become fatal with CONFIG_WERROR=y) when building ARCH=riscv allmodconfig with clang: drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_32.c:58:13: error: stack frame size (2448) exceeds limit (2048) in 'DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation' [-Werror,-Wframe-larger-than] 58 | static void DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation( | ^ 1 error generated. Many functions in this file use a large number of parameters, which must be passed on the stack at a certain point due to register exhaustion, which can cause high stack usage when inlining and issues with stack slot analysis get involved. While the compiler can and should do better (as GCC uses less than half the amount of stack space for the same function), it is not as simple a fix as adjusting the functions not to take a large number of parameters. Unfortunately, modifying these files to avoid the problem is a difficult-to-justify approach because any revisions to the files in the kernel tree never make it back to the original source (so copies of the code for newer hardware revisions just reintroduce the issue) and the files are hard to read/modify due to being "gcc-parsable HW gospel, coming straight from HW engineers". Avoid building the problematic code for RISC-V by modifying the existing condition for arm64 that exists for the same reason. Factor out the logical not to make the condition a little more readable naturally.
Fixes: a28e4b672f04 ("drm/amd/display: use ARCH_HAS_KERNEL_FPU_SUPPORT") Reported-by: Palmer Dabbelt Closes: https://lore.kernel.org/20240530145741.7506-2-palmer@rivosinc.com/ Reviewed-by: Harry Wentland Signed-off-by: Nathan Chancellor Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig index 5fcd4f778dc3d..47b8b49da8a72 100644 --- a/drivers/gpu/drm/amd/display/Kconfig +++ b/drivers/gpu/drm/amd/display/Kconfig @@ -8,7 +8,7 @@ config DRM_AMD_DC depends on BROKEN || !CC_IS_CLANG || ARM64 || RISCV || SPARC64 || X86_64 select SND_HDA_COMPONENT if SND_HDA_CORE # !CC_IS_CLANG: https://github.com/ClangBuiltLinux/linux/issues/1752 - select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && (!ARM64 || !CC_IS_CLANG) + select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && !(CC_IS_CLANG && (ARM64 || RISCV)) help Choose this option if you want to use the new display engine support for AMDGPU. This adds required support for Vega and From 8bf0287528da1992c5e49d757b99ad6bbc34b522 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 19 Jun 2024 14:46:48 -0500 Subject: [PATCH 194/272] cifs: fix typo in module parameter enable_gcm_256 enable_gcm_256 (which allows the server to require the strongest encryption) is enabled by default, but the modinfo description incorrectly showed it disabled by default. Fix the typo. Cc: stable@vger.kernel.org Fixes: fee742b50289 ("smb3.1.1: enable negotiating stronger encryption by default") Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index bb86fc0641d83..6397fdefd876d 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -134,7 +134,7 @@ module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. 
Default: y/Y/1"); module_param(enable_gcm_256, bool, 0644); -MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: n/N/0"); +MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/0"); module_param(require_gcm_256, bool, 0644); MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0"); From a498df5421fd737d11bfd152428ba6b1c8538321 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 20 May 2024 09:11:45 -0400 Subject: [PATCH 195/272] drm/radeon: fix UBSAN warning in kv_dpm.c Adds bounds check for sumo_vid_mapping_entry. Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/sumo_dpm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/radeon/sumo_dpm.c b/drivers/gpu/drm/radeon/sumo_dpm.c index 21d27e6235f39..b11f7c5bbcbe9 100644 --- a/drivers/gpu/drm/radeon/sumo_dpm.c +++ b/drivers/gpu/drm/radeon/sumo_dpm.c @@ -1619,6 +1619,8 @@ void sumo_construct_vid_mapping_table(struct radeon_device *rdev, for (i = 0; i < SUMO_MAX_HARDWARE_POWERLEVELS; i++) { if (table[i].ulSupportedSCLK != 0) { + if (table[i].usVoltageIndex >= SUMO_MAX_NUMBER_VOLTAGES) + continue; vid_mapping_table->entries[table[i].usVoltageIndex].vid_7bit = table[i].usVoltageID; vid_mapping_table->entries[table[i].usVoltageIndex].vid_2bit = From f0d576f840153392d04b2d52cf3adab8f62e8cb6 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 20 May 2024 09:05:21 -0400 Subject: [PATCH 196/272] drm/amdgpu: fix UBSAN warning in kv_dpm.c Adds bounds check for sumo_vid_mapping_entry. 
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3392 Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c index 6bb42d04b2479..e8b6989a40f35 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c @@ -164,6 +164,8 @@ static void sumo_construct_vid_mapping_table(struct amdgpu_device *adev, for (i = 0; i < SUMO_MAX_HARDWARE_POWERLEVELS; i++) { if (table[i].ulSupportedSCLK != 0) { + if (table[i].usVoltageIndex >= SUMO_MAX_NUMBER_VOLTAGES) + continue; vid_mapping_table->entries[table[i].usVoltageIndex].vid_7bit = table[i].usVoltageID; vid_mapping_table->entries[table[i].usVoltageIndex].vid_2bit = From e356d321d0240663a09b139fa3658ddbca163e27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 31 May 2024 10:56:00 +0200 Subject: [PATCH 197/272] drm/amdgpu: cleanup MES11 command submission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The approach of having a separate WB slot for each submission doesn't really work well and for example breaks GPU reset. Use a status query packet for the fence update instead; since those should always succeed, we can use the fence of the original packet to signal the state of the operation. While at it, clean up the coding style.
Fixes: eef016ba8986 ("drm/amdgpu/mes11: Use a separate fence per transaction") Reviewed-by: Mukul Joshi Signed-off-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 76 ++++++++++++++++---------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 0d1407f250059..32d4519541c6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -154,18 +154,18 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, void *pkt, int size, int api_status_off) { - int ndw = size / 4; - signed long r; - union MESAPI__MISC *x_pkt = pkt; - struct MES_API_STATUS *api_status; + union MESAPI__QUERY_MES_STATUS mes_status_pkt; + signed long timeout = 3000000; /* 3000 ms */ struct amdgpu_device *adev = mes->adev; struct amdgpu_ring *ring = &mes->ring; - unsigned long flags; - signed long timeout = 3000000; /* 3000 ms */ + struct MES_API_STATUS *api_status; + union MESAPI__MISC *x_pkt = pkt; const char *op_str, *misc_op_str; - u32 fence_offset; - u64 fence_gpu_addr; - u64 *fence_ptr; + unsigned long flags; + u64 status_gpu_addr; + u32 status_offset; + u64 *status_ptr; + signed long r; int ret; if (x_pkt->header.opcode >= MES_SCH_API_MAX) @@ -177,28 +177,38 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, /* Worst case in sriov where all other 15 VF timeout, each VF needs about 600ms */ timeout = 15 * 600 * 1000; } - BUG_ON(size % 4 != 0); - ret = amdgpu_device_wb_get(adev, &fence_offset); + ret = amdgpu_device_wb_get(adev, &status_offset); if (ret) return ret; - fence_gpu_addr = - adev->wb.gpu_addr + (fence_offset * 4); - fence_ptr = (u64 *)&adev->wb.wb[fence_offset]; - *fence_ptr = 0; + + status_gpu_addr = adev->wb.gpu_addr + (status_offset * 4); + status_ptr = (u64 *)&adev->wb.wb[status_offset]; + *status_ptr = 0; spin_lock_irqsave(&mes->ring_lock, flags); - 
if (amdgpu_ring_alloc(ring, ndw)) { - spin_unlock_irqrestore(&mes->ring_lock, flags); - amdgpu_device_wb_free(adev, fence_offset); - return -ENOMEM; - } + r = amdgpu_ring_alloc(ring, (size + sizeof(mes_status_pkt)) / 4); + if (r) + goto error_unlock_free; api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off); - api_status->api_completion_fence_addr = fence_gpu_addr; + api_status->api_completion_fence_addr = status_gpu_addr; api_status->api_completion_fence_value = 1; - amdgpu_ring_write_multiple(ring, pkt, ndw); + amdgpu_ring_write_multiple(ring, pkt, size / 4); + + memset(&mes_status_pkt, 0, sizeof(mes_status_pkt)); + mes_status_pkt.header.type = MES_API_TYPE_SCHEDULER; + mes_status_pkt.header.opcode = MES_SCH_API_QUERY_SCHEDULER_STATUS; + mes_status_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; + mes_status_pkt.api_status.api_completion_fence_addr = + ring->fence_drv.gpu_addr; + mes_status_pkt.api_status.api_completion_fence_value = + ++ring->fence_drv.sync_seq; + + amdgpu_ring_write_multiple(ring, &mes_status_pkt, + sizeof(mes_status_pkt) / 4); + amdgpu_ring_commit(ring); spin_unlock_irqrestore(&mes->ring_lock, flags); @@ -206,15 +216,16 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, misc_op_str = mes_v11_0_get_misc_op_string(x_pkt); if (misc_op_str) - dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, misc_op_str); + dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, + misc_op_str); else if (op_str) dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str); else - dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode); + dev_dbg(adev->dev, "MES msg=%d was emitted\n", + x_pkt->header.opcode); - r = amdgpu_mes_fence_wait_polling(fence_ptr, (u64)1, timeout); - amdgpu_device_wb_free(adev, fence_offset); - if (r < 1) { + r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, timeout); + if (r < 1 || !*status_ptr) { if (misc_op_str) dev_err(adev->dev, "MES failed to respond to msg=%s 
(%s)\n", @@ -229,10 +240,19 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, while (halt_if_hws_hang) schedule(); - return -ETIMEDOUT; + r = -ETIMEDOUT; + goto error_wb_free; } + amdgpu_device_wb_free(adev, status_offset); return 0; + +error_unlock_free: + spin_unlock_irqrestore(&mes->ring_lock, flags); + +error_wb_free: + amdgpu_device_wb_free(adev, status_offset); + return r; } static int convert_to_mes_queue_type(int queue_type) From ed5a4484f074aa2bfb1dad99ff3628ea8da4acdc Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Wed, 12 Jun 2024 14:30:40 +0800 Subject: [PATCH 198/272] drm/amdgpu: init TA fw for psp v14 Add support to init TA firmware for psp v14. Signed-off-by: Likun Gao Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/psp_v14_0.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c index f08a32c186946..40b28298af301 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c @@ -32,7 +32,9 @@ #include "mp/mp_14_0_2_sh_mask.h" MODULE_FIRMWARE("amdgpu/psp_14_0_2_sos.bin"); +MODULE_FIRMWARE("amdgpu/psp_14_0_2_ta.bin"); MODULE_FIRMWARE("amdgpu/psp_14_0_3_sos.bin"); +MODULE_FIRMWARE("amdgpu/psp_14_0_3_ta.bin"); /* For large FW files the time to complete can be very long */ #define USBC_PD_POLLING_LIMIT_S 240 @@ -64,6 +66,9 @@ static int psp_v14_0_init_microcode(struct psp_context *psp) case IP_VERSION(14, 0, 2): case IP_VERSION(14, 0, 3): err = psp_init_sos_microcode(psp, ucode_prefix); + if (err) + return err; + err = psp_init_ta_microcode(psp, ucode_prefix); if (err) return err; break; From f770a6e9a3d7a90f77863b51325614f37a57fef5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jun 2024 19:28:13 -0400 Subject: [PATCH 199/272] bcachefs: Fix initialization order for srcu barrier btree_iter_init() needs to happen before key_cache_init(), to initialize 
btree_trans_barrier Reported-by: syzbot+3cca837c2183f8f6fcaf@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 65e239d329157..635da5b3439cf 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -912,9 +912,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: bch2_fs_replicas_init(c) ?: + bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: From d47df4f616d523b4ef832d03ec28b2e6d838067b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jun 2024 19:51:15 -0400 Subject: [PATCH 200/272] bcachefs: Fix array-index-out-of-bounds We use 0 size arrays as markers, but ubsan doesn't know that - cast them to a pointer to fix the splat. Also, make sure this code gets tested a bit more. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 2 +- fs/bcachefs/bkey_methods.c | 6 +++++- fs/bcachefs/bkey_methods.h | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index f46978e5cb7c6..94a1d1982fa88 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -1064,7 +1064,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) { const struct bkey_format *f = bkey_packed(k) ? 
_f : &bch2_bkey_format_current; u8 *l = k->key_start; - u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; + u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1; while (l < h) { swap(*l, *h); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index c2c3dae521865..bd32aac051921 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -398,8 +398,12 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, for (i = 0; i < nr_compat; i++) switch (!write ? i : nr_compat - 1 - i) { case 0: - if (big_endian != CPU_BIG_ENDIAN) + if (big_endian != CPU_BIG_ENDIAN) { + bch2_bkey_swab_key(f, k); + } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { bch2_bkey_swab_key(f, k); + bch2_bkey_swab_key(f, k); + } break; case 1: if (version < bcachefs_metadata_version_bkey_renumber) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 726ef74837638..baef0722f5fb6 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -129,7 +129,8 @@ static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, struct bkey_packed *k) { if (version < bcachefs_metadata_version_current || - big_endian != CPU_BIG_ENDIAN) + big_endian != CPU_BIG_ENDIAN || + IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) __bch2_bkey_compat(level, btree_id, version, big_endian, write, f, k); From 3727ca56049d893859b68f70e50092250de79f28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:09:52 -0400 Subject: [PATCH 201/272] bcachefs: Fix a locking bug in the do_discard_fast() path We can't discard a bucket while it's still open; this needs the bucket_is_open_safe() version, which takes the open_buckets lock. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c4b6601f5b748..d2241f2b40fed 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -882,7 +882,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, closure_wake_up(&c->freelist_wait); if (statechange(a->data_type == BCH_DATA_need_discard) && - !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && + !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && bucket_flushed(new_a)) bch2_discard_one_bucket_fast(c, new.k->p); From d406545613b5c2716d5658038c46861863510b90 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:20:41 -0400 Subject: [PATCH 202/272] bcachefs: Fix shift overflow in read_one_super() Reported-by: syzbot+9f74cb4006b83e2a3df1@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 055478d21e9ef..b156fc85b8a3e 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -649,9 +649,10 @@ static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf bytes = vstruct_bytes(sb->sb); - if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) { - prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", - bytes, 512UL << sb->sb->layout.sb_max_size_bits); + u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits); + if (bytes > sb_size) { + prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)", + bytes, sb_size); return -BCH_ERR_invalid_sb_too_big; } From e3fd3faa453ce4cf4b6a0f3e29ee77d5d1b243a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:28:01 -0400 Subject: [PATCH 203/272] 
bcachefs: Fix btree ID bitmasks these should be 64 bit bitmasks, not 32 bit. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 5 +++-- fs/bcachefs/btree_types.h | 16 ++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 90c12fe2a2cd3..5d3c5b5e34af8 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1382,9 +1382,10 @@ enum btree_id { /* * Maximum number of btrees that we will _ever_ have under the current scheme, - * where we refer to them with bitfields + * where we refer to them with 64 bit bitfields - and we also need a bit for + * the interior btree node type: */ -#define BTREE_ID_NR_MAX 64 +#define BTREE_ID_NR_MAX 63 static inline bool btree_id_is_alloc(enum btree_id id) { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d63db4fefe734..87f485e9c552d 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -761,13 +761,13 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) static inline bool btree_node_type_is_extents(enum btree_node_type type) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) BCH_BTREE_IDS() #undef x ; - return (1U << type) & mask; + return BIT_ULL(type) & mask; } static inline bool btree_id_is_extents(enum btree_id btree) @@ -777,35 +777,35 @@ static inline bool btree_id_is_extents(enum btree_id btree) static inline bool btree_type_has_snapshots(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } static inline bool btree_type_has_snapshot_field(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) 
|((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } static inline bool btree_type_has_ptrs(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } struct btree_root { From 9e7cfb35e2668e542c333ed3ec4b0a951dd332ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:26:54 -0400 Subject: [PATCH 204/272] bcachefs: Check for invalid btree IDs We can only handle btree IDs up to 62, since the btree id (plus the type for interior btree nodes) has to fit into a 64 bit bitmask - check for invalid ones to avoid invalid shifts later. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 8 +++++++- fs/bcachefs/sb-errors_format.h | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index cf513fc79ce48..e632da69196cc 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -326,6 +326,12 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_btree_root: { struct btree_root *r; + if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, + c, invalid_btree_id, + "invalid btree id %u (max %u)", + entry->btree_id, BTREE_ID_NR_MAX)) + return 0; + while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); if (ret) @@ -415,7 +421,7 @@ static int journal_replay_entry_early(struct bch_fs *c, atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); } } - +fsck_err: return ret; } diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 84d2763bd597b..1d1251f1bb205 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -273,7 +273,11 @@
x(sb_clean_entry_overrun, 267) \ x(btree_ptr_v2_written_0, 268) \ x(subvol_snapshot_bad, 269) \ - x(subvol_inode_bad, 270) + x(subvol_inode_bad, 270) \ + x(alloc_key_stripe_sectors_wrong, 271) \ + x(accounting_mismatch, 272) \ + x(accounting_replicas_not_marked, 273) \ + x(invalid_btree_id, 274) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, From dbf4d79b7fc7e9bf5d1546f6dfffd789ea061221 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:36:34 -0400 Subject: [PATCH 205/272] bcachefs: Fix early init error path in journal code We shouldn't be running the journal shutdown sequence if we never fully initialized the journal. Reported-by: syzbot+ffd2270f0bca3322ee00@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index adec8e1ea73ea..dac2f498ae8b6 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1167,6 +1167,9 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { + if (!test_bit(JOURNAL_running, &j->flags)) + return; + bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); From 1ba44217f8258f92c56644ca4fad4462f1941e33 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 09:51:01 -0400 Subject: [PATCH 206/272] bcachefs: delete_dead_snapshots() doesn't need to go RW We've been moving away from going RW lazily; if we want to go RW we do that in set_may_go_rw(), and if we didn't go RW we don't need to delete dead snapshots.
Reported-by: syzbot+4366624c0b5aac4906cf@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 51918acfd7268..961b5f56358c8 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1565,13 +1565,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - if (!test_bit(BCH_FS_started, &c->flags)) { - ret = bch2_fs_read_write_early(c); - bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); - if (ret) - return ret; - } - trans = bch2_trans_get(c); /* From cff07e2739d81cf33eb2a378a6136eced852b8cb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 10:06:03 -0400 Subject: [PATCH 207/272] bcachefs: Guard against overflowing LRU_TIME_BITS LRUs only have 48 bits for the time field (i.e. LRU order); thus we need overflow checks and guards. Reported-by: syzbot+df3bf3f088dcaa728857@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 22 +++++++++++++++------- fs/bcachefs/alloc_background.h | 8 +++++++- fs/bcachefs/bcachefs.h | 5 +++++ fs/bcachefs/bcachefs_format.h | 3 +++ fs/bcachefs/lru.h | 3 --- fs/bcachefs/sb-errors_format.h | 3 ++- 6 files changed, 32 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index d2241f2b40fed..e258de7045789 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -259,6 +259,14 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, "invalid data type (got %u should be %u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + for (unsigned i = 0; i < 2; i++) + bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, + c, err, + alloc_key_io_time_bad, + "invalid io_time[%s]: %llu, max %llu", + i == READ ? 
"read" : "write", + a.v->io_time[i], LRU_TIME_MAX); + switch (a.v->data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: @@ -757,8 +765,8 @@ int bch2_trigger_alloc(struct btree_trans *trans, alloc_data_type_set(new_a, new_a->data_type); if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + new_a->io_time[READ] = bch2_current_io_time(c, READ); + new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } @@ -781,7 +789,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (new_a->data_type == BCH_DATA_cached && !new_a->io_time[READ]) - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[READ] = bch2_current_io_time(c, READ); u64 old_lru = alloc_lru_idx_read(*old_a); u64 new_lru = alloc_lru_idx_read(*new_a); @@ -1579,7 +1587,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) goto err; - a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); ret = bch2_trans_update(trans, alloc_iter, &a_mut->k_i, BTREE_TRIGGER_norun); if (ret) @@ -1975,8 +1983,8 @@ static int invalidate_one_bucket(struct btree_trans *trans, a->v.data_type = 0; a->v.dirty_sectors = 0; a->v.cached_sectors = 0; - a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); - a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + a->v.io_time[READ] = bch2_current_io_time(c, READ); + a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); ret = bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| @@ -2204,7 +2212,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, if (ret) return ret; - now = atomic64_read(&c->io_clock[rw].now); + now = bch2_current_io_time(c, 
rw); if (a->v.io_time[rw] == now) goto out; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index ae31a94be6f91..c3cc3c5ba5b63 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -141,7 +141,13 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, !bch2_bucket_sectors_fragmented(ca, a)) return 0; - u64 d = bch2_bucket_sectors_dirty(a); + /* + * avoid overflowing LRU_TIME_BITS on a corrupted fs, when + * bucket_sectors_dirty is (much) bigger than bucket_size + */ + u64 d = min(bch2_bucket_sectors_dirty(a), + ca->mi.bucket_size); + return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 2992a644d822c..a6b83ecab7ce5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1214,6 +1214,11 @@ static inline s64 bch2_current_time(const struct bch_fs *c) return timespec_to_bch2_time(c, now); } +static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw) +{ + return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX); +} + static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) { struct stdio_redirect *stdio = c->stdio; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5d3c5b5e34af8..4b98fed1ee9a4 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -476,6 +476,9 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + /* Optional/variable size superblock sections: */ struct bch_sb_field { diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index fb11ab0dd00ea..bd71ba77de078 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -2,9 +2,6 @@ #ifndef _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H -#define LRU_TIME_BITS 48 -#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) - static inline u64 lru_pos_id(struct bpos pos) { return pos.inode >> 
LRU_TIME_BITS; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 1d1251f1bb205..1768e5c49f999 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -277,7 +277,8 @@ x(alloc_key_stripe_sectors_wrong, 271) \ x(accounting_mismatch, 272) \ x(accounting_replicas_not_marked, 273) \ - x(invalid_btree_id, 274) + x(invalid_btree_id, 274) \ + x(alloc_key_io_time_bad, 275) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, From 2e9940d4a19507deb29b3e05571fcaaed88155e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 13:15:16 -0400 Subject: [PATCH 208/272] bcachefs: Handle cached data LRU wraparound We only have 48 bits for the LRU time field, which is insufficient to prevent wraparound. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 46 ++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e258de7045789..7b5909764d148 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2019,6 +2019,21 @@ static int invalidate_one_bucket(struct btree_trans *trans, goto out; } +static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, + struct bch_dev *ca, bool *wrapped) +{ + struct bkey_s_c k; +again: + k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + if (!k.k && !*wrapped) { + bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); + *wrapped = true; + goto again; + } + + return k; +} + static void bch2_do_invalidates_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); @@ -2032,12 +2047,33 @@ static void bch2_do_invalidates_work(struct work_struct *work) for_each_member_device(c, ca) { s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + struct btree_iter iter; + bool wrapped = false; + + 
bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, + ((bch2_current_io_time(c, READ) + U32_MAX) & + LRU_TIME_MAX)), 0); - ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, 0), - lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), - BTREE_ITER_intent, k, - invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); + while (true) { + bch2_trans_begin(trans); + + struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + if (!k.k) + break; + + ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); + if (ret) + break; + + bch2_btree_iter_advance(&iter); + } + bch2_trans_iter_exit(trans, &iter); if (ret < 0) { bch2_dev_put(ca); From ddd118ab45e848b1956ef8c8ef84963a554b5b58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 11:31:00 -0400 Subject: [PATCH 209/272] bcachefs: Fix bch2_sb_downgrade_update() Missing enum conversion Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-downgrade.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 3fb23e399ffb3..4710b61631f0f 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -228,7 +228,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) dst = (void *) &darray_top(table); dst->version = cpu_to_le16(src->version); - dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes); + dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes)); dst->recovery_passes[1] = 0; dst->nr_errors = cpu_to_le16(src->nr_errors); for (unsigned i = 0; i < src->nr_errors; i++) From 0a2a507d404eebbc168e8b1264edf0ac8c6047b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Jun 2024 08:43:15 -0400 Subject: [PATCH 210/272] bcachefs: set_worker_desc() for delete_dead_snapshots this is long running - help users see what's 
going on Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 961b5f56358c8..4ef98e696673f 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1680,6 +1680,8 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); + bch2_delete_dead_snapshots(c); bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } From a56da69799bd5f0c72bdc0fb64c3e3d8c1b1bb36 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Jun 2024 15:47:44 -0400 Subject: [PATCH 211/272] bcachefs: Fix bch2_trans_put() reference: https://github.com/koverstreet/bcachefs/issues/692 trans->ref is the reference used by the cycle detector, which walks btree_trans objects of other threads to walk the graph of held locks and issue wakeups when an abort is required. We have to wait for the ref to go to 1 before freeing trans->paths or clearing trans->locking_wait.task. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3694c600a3add..3a1419d178885 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3161,6 +3161,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) list_add_done: seqmutex_unlock(&c->btree_trans_lock); got_trans: + trans->ref.closure_get_happened = false; trans->c = c; trans->last_begin_time = local_clock(); trans->fn_idx = fn_idx; @@ -3235,7 +3236,6 @@ void bch2_trans_put(struct btree_trans *trans) trans_for_each_update(trans, i) __btree_path_put(trans->paths + i->path, true); trans->nr_updates = 0; - trans->locking_wait.task = NULL; check_btree_paths_leaked(trans); @@ -3256,6 +3256,13 @@ void bch2_trans_put(struct btree_trans *trans) if (unlikely(trans->journal_replay_not_finished)) bch2_journal_keys_put(c); + /* + * trans->ref protects trans->locking_wait.task, btree_paths arary; used + * by cycle detector + */ + closure_sync(&trans->ref); + trans->locking_wait.task = NULL; + unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; trans->paths = NULL; @@ -3273,8 +3280,6 @@ void bch2_trans_put(struct btree_trans *trans) trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); From 02a176d42a88805b3520148a4eee28b0760cd8c0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 18 Jun 2024 12:39:14 -0700 Subject: [PATCH 212/272] ipv6: bring NLM_DONE out to a separate recv() again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit under Fixes optimized the number of recv() calls needed during RTM_GETROUTE dumps, but we got multiple reports of applications hanging on recv() calls. 
Applications expect that a route dump will be terminated with a recv() reading an individual NLM_DONE message. Coalescing NLM_DONE is perfectly legal in netlink, but even tho reporters fixed the code in respective projects, chances are it will take time for those applications to get updated. So revert to old behavior (for now)? This is an IPv6 version of commit 460b0d33cf10 ("inet: bring NLM_DONE out to a separate recv() again"). Reported-by: Maciej Żenczykowski Link: https://lore.kernel.org/all/CANP3RGc1RG71oPEBXNx_WZFP9AyphJefdO4paczN92n__ds4ow@mail.gmail.com Reported-by: Stefano Brivio Link: https://lore.kernel.org/all/20240315124808.033ff58d@elisabeth Reported-by: Ilya Maximets Link: https://lore.kernel.org/all/02b50aae-f0e9-47a4-8365-a977a85975d3@ovn.org Fixes: 5fc68320c1fb ("ipv6: remove RTNL protection from inet6_dump_fib()") Tested-by: Ilya Maximets Link: https://lore.kernel.org/r/20240618193914.561782-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6e57c03e3255f..83e4f9855ae12 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2514,7 +2514,8 @@ int __init fib6_init(void) goto out_kmem_cache_create; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, - inet6_dump_fib, RTNL_FLAG_DUMP_UNLOCKED); + inet6_dump_fib, RTNL_FLAG_DUMP_UNLOCKED | + RTNL_FLAG_DUMP_SPLIT_NLM_DONE); if (ret) goto out_unregister_subsys; From 74382aebc9035470ec4c789bdb0d09d8c14f261e Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Tue, 18 Jun 2024 14:02:05 -0700 Subject: [PATCH 213/272] ice: Fix VSI list rule with ICE_SW_LKUP_LAST type Adding/updating VSI list rule, as well as allocating/freeing VSI list resource are called several times with type ICE_SW_LKUP_LAST, which fails because ice_update_vsi_list_rule() and ice_aq_alloc_free_vsi_list() consider it invalid. Allow calling these functions with ICE_SW_LKUP_LAST. 
This fixes at least one issue in switchdev mode, where the same rule with different action cannot be added, e.g.: tc filter add dev $PF1 ingress protocol arp prio 0 flower skip_sw \ dst_mac ff:ff:ff:ff:ff:ff action mirred egress redirect dev $VF1_PR tc filter add dev $PF1 ingress protocol arp prio 0 flower skip_sw \ dst_mac ff:ff:ff:ff:ff:ff action mirred egress redirect dev $VF2_PR Fixes: 0f94570d0cae ("ice: allow adding advanced rules") Suggested-by: Michal Swiatkowski Reviewed-by: Michal Swiatkowski Reviewed-by: Przemek Kitszel Signed-off-by: Marcin Szycik Reviewed-by: Jacob Keller Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20240618210206.981885-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_switch.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 94d6670d09013..1191031b2a43d 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -1899,7 +1899,8 @@ ice_aq_alloc_free_vsi_list(struct ice_hw *hw, u16 *vsi_list_id, lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || lkup_type == ICE_SW_LKUP_PROMISC || lkup_type == ICE_SW_LKUP_PROMISC_VLAN || - lkup_type == ICE_SW_LKUP_DFLT) { + lkup_type == ICE_SW_LKUP_DFLT || + lkup_type == ICE_SW_LKUP_LAST) { sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_REP); } else if (lkup_type == ICE_SW_LKUP_VLAN) { if (opc == ice_aqc_opc_alloc_res) @@ -2922,7 +2923,8 @@ ice_update_vsi_list_rule(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || lkup_type == ICE_SW_LKUP_PROMISC || lkup_type == ICE_SW_LKUP_PROMISC_VLAN || - lkup_type == ICE_SW_LKUP_DFLT) + lkup_type == ICE_SW_LKUP_DFLT || + lkup_type == ICE_SW_LKUP_LAST) rule_type = remove ? 
ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR : ICE_AQC_SW_RULES_T_VSI_LIST_SET; else if (lkup_type == ICE_SW_LKUP_VLAN) From f9ae848904289ddb16c7c9e4553ed4c64300de49 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 19 Jun 2024 01:29:04 +0100 Subject: [PATCH 214/272] net/tcp_ao: Don't leak ao_info on error-path It seems I introduced it together with TCP_AO_CMDF_AO_REQUIRED, on version 5 [1] of TCP-AO patches. Quite frustrative that having all these selftests that I've written, running kmemtest & kcov was always in todo. [1]: https://lore.kernel.org/netdev/20230215183335.800122-5-dima@arista.com/ Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240617072451.1403e1d2@kernel.org/ Fixes: 0aadc73995d0 ("net/tcp: Prevent TCP-MD5 with TCP-AO being set") Cc: stable@vger.kernel.org Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240619-tcp-ao-required-leak-v1-1-6408f3c94247@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ao.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 37c42b63ff993..09c0fa6756b7d 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -1968,8 +1968,10 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family, first = true; } - if (cmd.ao_required && tcp_ao_required_verify(sk)) - return -EKEYREJECTED; + if (cmd.ao_required && tcp_ao_required_verify(sk)) { + err = -EKEYREJECTED; + goto out; + } /* For sockets in TCP_CLOSED it's possible set keys that aren't * matching the future peer (address/port/VRF/etc), From d21d44dbdde83c4a8553c95de1853e63e88d7954 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Mon, 17 Jun 2024 17:47:36 +0200 Subject: [PATCH 215/272] drm/xe/vf: Don't touch GuC irq registers if using memory irqs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On platforms where VFs are using memory based 
interrupts, we missed invalid access to no longer existing interrupt registers, as we keep them marked with XE_REG_OPTION_VF. To fix that just either setup memirq vectors in GuC or enable legacy interrupts. Fixes: aef4eb7c7dec ("drm/xe/vf: Setup memory based interrupts in GuC") Signed-off-by: Michal Wajdeczko Cc: Matt Roper Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240617154736.685-1-michal.wajdeczko@intel.com (cherry picked from commit f0ccd2d805e55e12b430d5d6b9acd9f891af455e) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c index 240e7a4bbff1a..5faca4fc2fef5 100644 --- a/drivers/gpu/drm/xe/xe_guc.c +++ b/drivers/gpu/drm/xe/xe_guc.c @@ -631,8 +631,6 @@ int xe_guc_enable_communication(struct xe_guc *guc) struct xe_device *xe = guc_to_xe(guc); int err; - guc_enable_irq(guc); - if (IS_SRIOV_VF(xe) && xe_device_has_memirq(xe)) { struct xe_gt *gt = guc_to_gt(guc); struct xe_tile *tile = gt_to_tile(gt); @@ -640,6 +638,8 @@ int xe_guc_enable_communication(struct xe_guc *guc) err = xe_memirq_init_guc(&tile->sriov.vf.memirq, guc); if (err) return err; + } else { + guc_enable_irq(guc); } xe_mmio_rmw32(guc_to_gt(guc), PMINTRMSK, From 9b1effff19cdf2230d3ecb07ff4038a0da32e9cc Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Wed, 19 Jun 2024 17:16:02 +0100 Subject: [PATCH 216/272] ALSA: hda: cs35l56: Select SERIAL_MULTI_INSTANTIATE The ACPI IDs used in the CS35L56 HDA drivers are all handled by the serial multi-instantiate driver which starts multiple Linux device instances from a single ACPI Device() node. As serial multi-instantiate is not an optional part of the system add it as a dependency in Kconfig so that it is not overlooked. 
Signed-off-by: Simon Trimmer Link: https://lore.kernel.org/20240619161602.117452-1-simont@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/Kconfig b/sound/pci/hda/Kconfig index 0da625533afc2..e59df40a0007a 100644 --- a/sound/pci/hda/Kconfig +++ b/sound/pci/hda/Kconfig @@ -162,6 +162,7 @@ config SND_HDA_SCODEC_CS35L56_I2C depends on ACPI || COMPILE_TEST depends on SND_SOC select FW_CS_DSP + select SERIAL_MULTI_INSTANTIATE select SND_HDA_GENERIC select SND_SOC_CS35L56_SHARED select SND_HDA_SCODEC_CS35L56 @@ -178,6 +179,7 @@ config SND_HDA_SCODEC_CS35L56_SPI depends on ACPI || COMPILE_TEST depends on SND_SOC select FW_CS_DSP + select SERIAL_MULTI_INSTANTIATE select SND_HDA_GENERIC select SND_SOC_CS35L56_SHARED select SND_HDA_SCODEC_CS35L56 From 6cd4a78d962bebbaf8beb7d2ead3f34120e3f7b2 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 17 Jun 2024 22:02:05 +0100 Subject: [PATCH 217/272] net: do not leave a dangling sk pointer, when socket creation fails It is possible to trigger a use-after-free by: * attaching an fentry probe to __sock_release() and the probe calling the bpf_get_socket_cookie() helper * running traceroute -I 1.1.1.1 on a freshly booted VM A KASAN enabled kernel will log something like below (decoded and stripped): ================================================================== BUG: KASAN: slab-use-after-free in __sock_gen_cookie (./arch/x86/include/asm/atomic64_64.h:15 ./include/linux/atomic/atomic-arch-fallback.h:2583 ./include/linux/atomic/atomic-instrumented.h:1611 net/core/sock_diag.c:29) Read of size 8 at addr ffff888007110dd8 by task traceroute/299 CPU: 2 PID: 299 Comm: traceroute Tainted: G E 6.10.0-rc2+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:117 (discriminator 1)) print_report (mm/kasan/report.c:378 mm/kasan/report.c:488) ? 
__sock_gen_cookie (./arch/x86/include/asm/atomic64_64.h:15 ./include/linux/atomic/atomic-arch-fallback.h:2583 ./include/linux/atomic/atomic-instrumented.h:1611 net/core/sock_diag.c:29) kasan_report (mm/kasan/report.c:603) ? __sock_gen_cookie (./arch/x86/include/asm/atomic64_64.h:15 ./include/linux/atomic/atomic-arch-fallback.h:2583 ./include/linux/atomic/atomic-instrumented.h:1611 net/core/sock_diag.c:29) kasan_check_range (mm/kasan/generic.c:183 mm/kasan/generic.c:189) __sock_gen_cookie (./arch/x86/include/asm/atomic64_64.h:15 ./include/linux/atomic/atomic-arch-fallback.h:2583 ./include/linux/atomic/atomic-instrumented.h:1611 net/core/sock_diag.c:29) bpf_get_socket_ptr_cookie (./arch/x86/include/asm/preempt.h:94 ./include/linux/sock_diag.h:42 net/core/filter.c:5094 net/core/filter.c:5092) bpf_prog_875642cf11f1d139___sock_release+0x6e/0x8e bpf_trampoline_6442506592+0x47/0xaf __sock_release (net/socket.c:652) __sock_create (net/socket.c:1601) ... Allocated by task 299 on cpu 2 at 78.328492s: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:68) __kasan_slab_alloc (mm/kasan/common.c:312 mm/kasan/common.c:338) kmem_cache_alloc_noprof (mm/slub.c:3941 mm/slub.c:4000 mm/slub.c:4007) sk_prot_alloc (net/core/sock.c:2075) sk_alloc (net/core/sock.c:2134) inet_create (net/ipv4/af_inet.c:327 net/ipv4/af_inet.c:252) __sock_create (net/socket.c:1572) __sys_socket (net/socket.c:1660 net/socket.c:1644 net/socket.c:1706) __x64_sys_socket (net/socket.c:1718) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Freed by task 299 on cpu 2 at 78.328502s: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:68) kasan_save_free_info (mm/kasan/generic.c:582) poison_slab_object (mm/kasan/common.c:242) __kasan_slab_free (mm/kasan/common.c:256) kmem_cache_free (mm/slub.c:4437 mm/slub.c:4511) __sk_destruct (net/core/sock.c:2117 net/core/sock.c:2208) inet_create 
(net/ipv4/af_inet.c:397 net/ipv4/af_inet.c:252) __sock_create (net/socket.c:1572) __sys_socket (net/socket.c:1660 net/socket.c:1644 net/socket.c:1706) __x64_sys_socket (net/socket.c:1718) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Fix this by clearing the struct socket reference in sk_common_release() to cover all protocol families create functions, which may already attached the reference to the sk object with sock_init_data(). Fixes: c5dbb89fc2ac ("bpf: Expose bpf_get_socket_cookie to tracing programs") Suggested-by: Kuniyuki Iwashima Signed-off-by: Ignat Korchagin Cc: stable@vger.kernel.org Link: https://lore.kernel.org/netdev/20240613194047.36478-1-kuniyu@amazon.com/T/ Reviewed-by: Kuniyuki Iwashima Reviewed-by: D. Wythe Link: https://lore.kernel.org/r/20240617210205.67311-1-ignat@cloudflare.com Signed-off-by: Paolo Abeni --- net/core/sock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/sock.c b/net/core/sock.c index 8629f9aecf91a..100e975073ca5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3742,6 +3742,9 @@ void sk_common_release(struct sock *sk) sk->sk_prot->unhash(sk); + if (sk->sk_socket) + sk->sk_socket->sk = NULL; + /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and From 8851346912a1fa33e7a5966fe51f07313b274627 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 18 Jun 2024 09:38:21 +0200 Subject: [PATCH 218/272] net: stmmac: Assign configured channel value to EXTTS event Assign the configured channel value to the EXTTS event in the timestamp interrupt handler. Without assigning the correct channel, applications like ts2phc will refuse to accept the event, resulting in errors such as: ... 
ts2phc[656.834]: config item end1.ts2phc.pin_index is 0 ts2phc[656.834]: config item end1.ts2phc.channel is 3 ts2phc[656.834]: config item end1.ts2phc.extts_polarity is 2 ts2phc[656.834]: config item end1.ts2phc.extts_correction is 0 ... ts2phc[656.862]: extts on unexpected channel ts2phc[658.141]: extts on unexpected channel ts2phc[659.140]: extts on unexpected channel Fixes: f4da56529da60 ("net: stmmac: Add support for external trigger timestamping") Cc: stable@vger.kernel.org Signed-off-by: Oleksij Rempel Reviewed-by: Wojciech Drewek Link: https://lore.kernel.org/r/20240618073821.619751-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index f05bd757dfe52..5ef52ef2698fb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -218,6 +218,7 @@ static void timestamp_interrupt(struct stmmac_priv *priv) { u32 num_snapshot, ts_status, tsync_int; struct ptp_clock_event event; + u32 acr_value, channel; unsigned long flags; u64 ptp_time; int i; @@ -243,12 +244,15 @@ static void timestamp_interrupt(struct stmmac_priv *priv) num_snapshot = (ts_status & GMAC_TIMESTAMP_ATSNS_MASK) >> GMAC_TIMESTAMP_ATSNS_SHIFT; + acr_value = readl(priv->ptpaddr + PTP_ACR); + channel = ilog2(FIELD_GET(PTP_ACR_MASK, acr_value)); + for (i = 0; i < num_snapshot; i++) { read_lock_irqsave(&priv->ptp_lock, flags); get_ptptime(priv->ptpaddr, &ptp_time); read_unlock_irqrestore(&priv->ptp_lock, flags); event.type = PTP_CLOCK_EXTTS; - event.index = 0; + event.index = channel; event.timestamp = ptp_time; ptp_clock_event(priv->ptp_clock, &event); } From a23800f08a60787dfbf2b87b2e6ed411cb629859 Mon Sep 17 00:00:00 2001 From: Chenliang Li Date: Wed, 19 Jun 2024 14:38:19 +0800 Subject: [PATCH 
219/272] io_uring/rsrc: fix incorrect assignment of iter->nr_segs in io_import_fixed In io_import_fixed when advancing the iter within the first bvec, the iter->nr_segs is set to bvec->bv_len. nr_segs should be the number of bvecs, plus we don't need to adjust it here, so just remove it. Fixes: b000ae0ec2d7 ("io_uring/rsrc: optimise single entry advance") Signed-off-by: Chenliang Li Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20240619063819.2445-1-cliang01.li@samsung.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index edb9c5baf2e29..570bfa6a31aa9 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1068,7 +1068,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter, * branch doesn't expect non PAGE_SIZE'd chunks. */ iter->bvec = bvec; - iter->nr_segs = bvec->bv_len; iter->count -= offset; iter->iov_offset = offset; } else { From 33dfafa90285c0873a24d633877d505ab8e3fc20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Jun 2024 09:55:48 -0400 Subject: [PATCH 220/272] bcachefs: Fix safe errors by default i.e. the start of automatic self healing: If errors=continue or fix_safe, we now automatically fix simple errors without user intervention. New error action option: fix_safe This replaces the existing errors=ro option, which gets a new slot, i.e. existing errors=ro users now get errors=fix_safe. This is currently only enabled for a limited set of errors - initially just disk accounting; errors we would never not want to fix, and we don't want to require user intervention (i.e. to make sure a bug report gets filed). Errors will still be counted in the superblock, so we (developers) will still know they've been occurring if a bug report gets filed (as bug reports typically include the errors superblock section). Eventually we'll be enabling this for a much wider set of errors, after we've done thorough error injection testing.
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 5 +- fs/bcachefs/error.c | 19 +- fs/bcachefs/error.h | 7 - fs/bcachefs/opts.h | 2 +- fs/bcachefs/sb-errors_format.h | 564 +++++++++++++++++---------------- 5 files changed, 308 insertions(+), 289 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 4b98fed1ee9a4..e3b1bde489c3b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -990,8 +990,9 @@ enum bch_version_upgrade_opts { #define BCH_ERROR_ACTIONS() \ x(continue, 0) \ - x(ro, 1) \ - x(panic, 2) + x(fix_safe, 1) \ + x(panic, 2) \ + x(ro, 3) enum bch_error_actions { #define x(t, n) BCH_ON_ERROR_##t = n, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index c66eeffcd7f2a..d95c40f1b6af8 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -15,6 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) switch (c->opts.errors) { case BCH_ON_ERROR_continue: return false; + case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", @@ -191,6 +192,12 @@ static void prt_actioning(struct printbuf *out, const char *action) prt_str(out, "ing"); } +static const u8 fsck_flags_extra[] = { +#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags, + BCH_SB_ERRS() +#undef x +}; + int bch2_fsck_err(struct bch_fs *c, enum bch_fsck_flags flags, enum bch_sb_error_id err, @@ -203,6 +210,9 @@ int bch2_fsck_err(struct bch_fs *c, int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + flags |= fsck_flags_extra[err]; + if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) return -BCH_ERR_fsck_fix; @@ -265,7 +275,14 @@ int bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + if ((flags & FSCK_CAN_FIX) && + (flags & 
FSCK_AUTOFIX) && + (c->opts.errors == BCH_ON_ERROR_continue || + c->opts.errors == BCH_ON_ERROR_fix_safe)) { + prt_str(out, ", "); + prt_actioning(out, action); + ret = -BCH_ERR_fsck_fix; + } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 36caedf72d89a..777711504c35c 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -108,13 +108,6 @@ struct fsck_err_state { char *last_msg; }; -enum bch_fsck_flags { - FSCK_CAN_FIX = 1 << 0, - FSCK_CAN_IGNORE = 1 << 1, - FSCK_NEED_FSCK = 1 << 2, - FSCK_NO_RATELIMIT = 1 << 3, -}; - #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) __printf(4, 5) __cold diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 25530e0bb2f38..b197ec90d4cb0 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -137,7 +137,7 @@ enum fsck_err_opts { x(errors, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 1768e5c49f999..d6f35a99c4291 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -2,286 +2,294 @@ #ifndef _BCACHEFS_SB_ERRORS_FORMAT_H #define _BCACHEFS_SB_ERRORS_FORMAT_H -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0) \ - x(dirty_but_no_journal_entries, 1) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ - x(sb_clean_journal_seq_mismatch, 3) \ - x(sb_clean_btree_root_mismatch, 4) \ - x(sb_clean_missing, 5) \ - x(jset_unsupported_version, 6) \ - x(jset_unknown_csum, 7) \ - x(jset_last_seq_newer_than_seq, 8) \ - x(jset_past_bucket_end, 9) \ - 
x(jset_seq_blacklisted, 10) \ - x(journal_entries_missing, 11) \ - x(journal_entry_replicas_not_marked, 12) \ - x(journal_entry_past_jset_end, 13) \ - x(journal_entry_replicas_data_mismatch, 14) \ - x(journal_entry_bkey_u64s_0, 15) \ - x(journal_entry_bkey_past_end, 16) \ - x(journal_entry_bkey_bad_format, 17) \ - x(journal_entry_bkey_invalid, 18) \ - x(journal_entry_btree_root_bad_size, 19) \ - x(journal_entry_blacklist_bad_size, 20) \ - x(journal_entry_blacklist_v2_bad_size, 21) \ - x(journal_entry_blacklist_v2_start_past_end, 22) \ - x(journal_entry_usage_bad_size, 23) \ - x(journal_entry_data_usage_bad_size, 24) \ - x(journal_entry_clock_bad_size, 25) \ - x(journal_entry_clock_bad_rw, 26) \ - x(journal_entry_dev_usage_bad_size, 27) \ - x(journal_entry_dev_usage_bad_dev, 28) \ - x(journal_entry_dev_usage_bad_pad, 29) \ - x(btree_node_unreadable, 30) \ - x(btree_node_fault_injected, 31) \ - x(btree_node_bad_magic, 32) \ - x(btree_node_bad_seq, 33) \ - x(btree_node_unsupported_version, 34) \ - x(btree_node_bset_older_than_sb_min, 35) \ - x(btree_node_bset_newer_than_sb, 36) \ - x(btree_node_data_missing, 37) \ - x(btree_node_bset_after_end, 38) \ - x(btree_node_replicas_sectors_written_mismatch, 39) \ - x(btree_node_replicas_data_mismatch, 40) \ - x(bset_unknown_csum, 41) \ - x(bset_bad_csum, 42) \ - x(bset_past_end_of_btree_node, 43) \ - x(bset_wrong_sector_offset, 44) \ - x(bset_empty, 45) \ - x(bset_bad_seq, 46) \ - x(bset_blacklisted_journal_seq, 47) \ - x(first_bset_blacklisted_journal_seq, 48) \ - x(btree_node_bad_btree, 49) \ - x(btree_node_bad_level, 50) \ - x(btree_node_bad_min_key, 51) \ - x(btree_node_bad_max_key, 52) \ - x(btree_node_bad_format, 53) \ - x(btree_node_bkey_past_bset_end, 54) \ - x(btree_node_bkey_bad_format, 55) \ - x(btree_node_bad_bkey, 56) \ - x(btree_node_bkey_out_of_order, 57) \ - x(btree_root_bkey_invalid, 58) \ - x(btree_root_read_error, 59) \ - x(btree_root_bad_min_key, 60) \ - x(btree_root_bad_max_key, 61) \ - 
x(btree_node_read_error, 62) \ - x(btree_node_topology_bad_min_key, 63) \ - x(btree_node_topology_bad_max_key, 64) \ - x(btree_node_topology_overwritten_by_prev_node, 65) \ - x(btree_node_topology_overwritten_by_next_node, 66) \ - x(btree_node_topology_interior_node_empty, 67) \ - x(fs_usage_hidden_wrong, 68) \ - x(fs_usage_btree_wrong, 69) \ - x(fs_usage_data_wrong, 70) \ - x(fs_usage_cached_wrong, 71) \ - x(fs_usage_reserved_wrong, 72) \ - x(fs_usage_persistent_reserved_wrong, 73) \ - x(fs_usage_nr_inodes_wrong, 74) \ - x(fs_usage_replicas_wrong, 75) \ - x(dev_usage_buckets_wrong, 76) \ - x(dev_usage_sectors_wrong, 77) \ - x(dev_usage_fragmented_wrong, 78) \ - x(dev_usage_buckets_ec_wrong, 79) \ - x(bkey_version_in_future, 80) \ - x(bkey_u64s_too_small, 81) \ - x(bkey_invalid_type_for_btree, 82) \ - x(bkey_extent_size_zero, 83) \ - x(bkey_extent_size_greater_than_offset, 84) \ - x(bkey_size_nonzero, 85) \ - x(bkey_snapshot_nonzero, 86) \ - x(bkey_snapshot_zero, 87) \ - x(bkey_at_pos_max, 88) \ - x(bkey_before_start_of_btree_node, 89) \ - x(bkey_after_end_of_btree_node, 90) \ - x(bkey_val_size_nonzero, 91) \ - x(bkey_val_size_too_small, 92) \ - x(alloc_v1_val_size_bad, 93) \ - x(alloc_v2_unpack_error, 94) \ - x(alloc_v3_unpack_error, 95) \ - x(alloc_v4_val_size_bad, 96) \ - x(alloc_v4_backpointers_start_bad, 97) \ - x(alloc_key_data_type_bad, 98) \ - x(alloc_key_empty_but_have_data, 99) \ - x(alloc_key_dirty_sectors_0, 100) \ - x(alloc_key_data_type_inconsistency, 101) \ - x(alloc_key_to_missing_dev_bucket, 102) \ - x(alloc_key_cached_inconsistency, 103) \ - x(alloc_key_cached_but_read_time_zero, 104) \ - x(alloc_key_to_missing_lru_entry, 105) \ - x(alloc_key_data_type_wrong, 106) \ - x(alloc_key_gen_wrong, 107) \ - x(alloc_key_dirty_sectors_wrong, 108) \ - x(alloc_key_cached_sectors_wrong, 109) \ - x(alloc_key_stripe_wrong, 110) \ - x(alloc_key_stripe_redundancy_wrong, 111) \ - x(bucket_sector_count_overflow, 112) \ - x(bucket_metadata_type_mismatch, 113) \ - 
x(need_discard_key_wrong, 114) \ - x(freespace_key_wrong, 115) \ - x(freespace_hole_missing, 116) \ - x(bucket_gens_val_size_bad, 117) \ - x(bucket_gens_key_wrong, 118) \ - x(bucket_gens_hole_wrong, 119) \ - x(bucket_gens_to_invalid_dev, 120) \ - x(bucket_gens_to_invalid_buckets, 121) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ - x(need_discard_freespace_key_bad, 124) \ - x(backpointer_bucket_offset_wrong, 125) \ - x(backpointer_to_missing_device, 126) \ - x(backpointer_to_missing_alloc, 127) \ - x(backpointer_to_missing_ptr, 128) \ - x(lru_entry_at_time_0, 129) \ - x(lru_entry_to_invalid_bucket, 130) \ - x(lru_entry_bad, 131) \ - x(btree_ptr_val_too_big, 132) \ - x(btree_ptr_v2_val_too_big, 133) \ - x(btree_ptr_has_non_ptr, 134) \ - x(extent_ptrs_invalid_entry, 135) \ - x(extent_ptrs_no_ptrs, 136) \ - x(extent_ptrs_too_many_ptrs, 137) \ - x(extent_ptrs_redundant_crc, 138) \ - x(extent_ptrs_redundant_stripe, 139) \ - x(extent_ptrs_unwritten, 140) \ - x(extent_ptrs_written_and_unwritten, 141) \ - x(ptr_to_invalid_device, 142) \ - x(ptr_to_duplicate_device, 143) \ - x(ptr_after_last_bucket, 144) \ - x(ptr_before_first_bucket, 145) \ - x(ptr_spans_multiple_buckets, 146) \ - x(ptr_to_missing_backpointer, 147) \ - x(ptr_to_missing_alloc_key, 148) \ - x(ptr_to_missing_replicas_entry, 149) \ - x(ptr_to_missing_stripe, 150) \ - x(ptr_to_incorrect_stripe, 151) \ - x(ptr_gen_newer_than_bucket_gen, 152) \ - x(ptr_too_stale, 153) \ - x(stale_dirty_ptr, 154) \ - x(ptr_bucket_data_type_mismatch, 155) \ - x(ptr_cached_and_erasure_coded, 156) \ - x(ptr_crc_uncompressed_size_too_small, 157) \ - x(ptr_crc_csum_type_unknown, 158) \ - x(ptr_crc_compression_type_unknown, 159) \ - x(ptr_crc_redundant, 160) \ - x(ptr_crc_uncompressed_size_too_big, 161) \ - x(ptr_crc_nonce_mismatch, 162) \ - x(ptr_stripe_redundant, 163) \ - x(reservation_key_nr_replicas_invalid, 164) \ - x(reflink_v_refcount_wrong, 165) \ - 
x(reflink_p_to_missing_reflink_v, 166) \ - x(stripe_pos_bad, 167) \ - x(stripe_val_size_bad, 168) \ - x(stripe_sector_count_wrong, 169) \ - x(snapshot_tree_pos_bad, 170) \ - x(snapshot_tree_to_missing_snapshot, 171) \ - x(snapshot_tree_to_missing_subvol, 172) \ - x(snapshot_tree_to_wrong_subvol, 173) \ - x(snapshot_tree_to_snapshot_subvol, 174) \ - x(snapshot_pos_bad, 175) \ - x(snapshot_parent_bad, 176) \ - x(snapshot_children_not_normalized, 177) \ - x(snapshot_child_duplicate, 178) \ - x(snapshot_child_bad, 179) \ - x(snapshot_skiplist_not_normalized, 180) \ - x(snapshot_skiplist_bad, 181) \ - x(snapshot_should_not_have_subvol, 182) \ - x(snapshot_to_bad_snapshot_tree, 183) \ - x(snapshot_bad_depth, 184) \ - x(snapshot_bad_skiplist, 185) \ - x(subvol_pos_bad, 186) \ - x(subvol_not_master_and_not_snapshot, 187) \ - x(subvol_to_missing_root, 188) \ - x(subvol_root_wrong_bi_subvol, 189) \ - x(bkey_in_missing_snapshot, 190) \ - x(inode_pos_inode_nonzero, 191) \ - x(inode_pos_blockdev_range, 192) \ - x(inode_unpack_error, 193) \ - x(inode_str_hash_invalid, 194) \ - x(inode_v3_fields_start_bad, 195) \ - x(inode_snapshot_mismatch, 196) \ - x(inode_unlinked_but_clean, 197) \ - x(inode_unlinked_but_nlink_nonzero, 198) \ - x(inode_checksum_type_invalid, 199) \ - x(inode_compression_type_invalid, 200) \ - x(inode_subvol_root_but_not_dir, 201) \ - x(inode_i_size_dirty_but_clean, 202) \ - x(inode_i_sectors_dirty_but_clean, 203) \ - x(inode_i_sectors_wrong, 204) \ - x(inode_dir_wrong_nlink, 205) \ - x(inode_dir_multiple_links, 206) \ - x(inode_multiple_links_but_nlink_0, 207) \ - x(inode_wrong_backpointer, 208) \ - x(inode_wrong_nlink, 209) \ - x(inode_unreachable, 210) \ - x(deleted_inode_but_clean, 211) \ - x(deleted_inode_missing, 212) \ - x(deleted_inode_is_dir, 213) \ - x(deleted_inode_not_unlinked, 214) \ - x(extent_overlapping, 215) \ - x(extent_in_missing_inode, 216) \ - x(extent_in_non_reg_inode, 217) \ - x(extent_past_end_of_inode, 218) \ - x(dirent_empty_name, 219) 
\ - x(dirent_val_too_big, 220) \ - x(dirent_name_too_long, 221) \ - x(dirent_name_embedded_nul, 222) \ - x(dirent_name_dot_or_dotdot, 223) \ - x(dirent_name_has_slash, 224) \ - x(dirent_d_type_wrong, 225) \ - x(inode_bi_parent_wrong, 226) \ - x(dirent_in_missing_dir_inode, 227) \ - x(dirent_in_non_dir_inode, 228) \ - x(dirent_to_missing_inode, 229) \ - x(dirent_to_missing_subvol, 230) \ - x(dirent_to_itself, 231) \ - x(quota_type_invalid, 232) \ - x(xattr_val_size_too_small, 233) \ - x(xattr_val_size_too_big, 234) \ - x(xattr_invalid_type, 235) \ - x(xattr_name_invalid_chars, 236) \ - x(xattr_in_missing_inode, 237) \ - x(root_subvol_missing, 238) \ - x(root_dir_missing, 239) \ - x(root_inode_not_dir, 240) \ - x(dir_loop, 241) \ - x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) \ - x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) \ - x(journal_entry_dup_same_device, 246) \ - x(inode_bi_subvol_missing, 247) \ - x(inode_bi_subvol_wrong, 248) \ - x(inode_points_to_missing_dirent, 249) \ - x(inode_points_to_wrong_dirent, 250) \ - x(inode_bi_parent_nonzero, 251) \ - x(dirent_to_missing_parent_subvol, 252) \ - x(dirent_not_visible_in_parent_subvol, 253) \ - x(subvol_fs_path_parent_wrong, 254) \ - x(subvol_root_fs_path_parent_nonzero, 255) \ - x(subvol_children_not_set, 256) \ - x(subvol_children_bad, 257) \ - x(subvol_loop, 258) \ - x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) \ - x(btree_node_topology_empty_interior_node, 261) \ - x(btree_ptr_v2_min_key_bad, 262) \ - x(btree_root_unreadable_and_scan_found_nothing, 263) \ - x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) \ - x(btree_bitmap_not_marked, 266) \ - x(sb_clean_entry_overrun, 267) \ - x(btree_ptr_v2_written_0, 268) \ - x(subvol_snapshot_bad, 269) \ - x(subvol_inode_bad, 270) \ - x(alloc_key_stripe_sectors_wrong, 271) \ - x(accounting_mismatch, 272) \ - x(accounting_replicas_not_marked, 273) \ - 
x(invalid_btree_id, 274) \ - x(alloc_key_io_time_bad, 275) +enum bch_fsck_flags { + FSCK_CAN_FIX = 1 << 0, + FSCK_CAN_IGNORE = 1 << 1, + FSCK_NEED_FSCK = 1 << 2, + FSCK_NO_RATELIMIT = 1 << 3, + FSCK_AUTOFIX = 1 << 4, +}; + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0, 0) \ + x(dirty_but_no_journal_entries, 1, 0) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \ + x(sb_clean_journal_seq_mismatch, 3, 0) \ + x(sb_clean_btree_root_mismatch, 4, 0) \ + x(sb_clean_missing, 5, 0) \ + x(jset_unsupported_version, 6, 0) \ + x(jset_unknown_csum, 7, 0) \ + x(jset_last_seq_newer_than_seq, 8, 0) \ + x(jset_past_bucket_end, 9, 0) \ + x(jset_seq_blacklisted, 10, 0) \ + x(journal_entries_missing, 11, 0) \ + x(journal_entry_replicas_not_marked, 12, 0) \ + x(journal_entry_past_jset_end, 13, 0) \ + x(journal_entry_replicas_data_mismatch, 14, 0) \ + x(journal_entry_bkey_u64s_0, 15, 0) \ + x(journal_entry_bkey_past_end, 16, 0) \ + x(journal_entry_bkey_bad_format, 17, 0) \ + x(journal_entry_bkey_invalid, 18, 0) \ + x(journal_entry_btree_root_bad_size, 19, 0) \ + x(journal_entry_blacklist_bad_size, 20, 0) \ + x(journal_entry_blacklist_v2_bad_size, 21, 0) \ + x(journal_entry_blacklist_v2_start_past_end, 22, 0) \ + x(journal_entry_usage_bad_size, 23, 0) \ + x(journal_entry_data_usage_bad_size, 24, 0) \ + x(journal_entry_clock_bad_size, 25, 0) \ + x(journal_entry_clock_bad_rw, 26, 0) \ + x(journal_entry_dev_usage_bad_size, 27, 0) \ + x(journal_entry_dev_usage_bad_dev, 28, 0) \ + x(journal_entry_dev_usage_bad_pad, 29, 0) \ + x(btree_node_unreadable, 30, 0) \ + x(btree_node_fault_injected, 31, 0) \ + x(btree_node_bad_magic, 32, 0) \ + x(btree_node_bad_seq, 33, 0) \ + x(btree_node_unsupported_version, 34, 0) \ + x(btree_node_bset_older_than_sb_min, 35, 0) \ + x(btree_node_bset_newer_than_sb, 36, 0) \ + x(btree_node_data_missing, 37, 0) \ + x(btree_node_bset_after_end, 38, 0) \ + x(btree_node_replicas_sectors_written_mismatch, 39, 0) \ + 
x(btree_node_replicas_data_mismatch, 40, 0) \ + x(bset_unknown_csum, 41, 0) \ + x(bset_bad_csum, 42, 0) \ + x(bset_past_end_of_btree_node, 43, 0) \ + x(bset_wrong_sector_offset, 44, 0) \ + x(bset_empty, 45, 0) \ + x(bset_bad_seq, 46, 0) \ + x(bset_blacklisted_journal_seq, 47, 0) \ + x(first_bset_blacklisted_journal_seq, 48, 0) \ + x(btree_node_bad_btree, 49, 0) \ + x(btree_node_bad_level, 50, 0) \ + x(btree_node_bad_min_key, 51, 0) \ + x(btree_node_bad_max_key, 52, 0) \ + x(btree_node_bad_format, 53, 0) \ + x(btree_node_bkey_past_bset_end, 54, 0) \ + x(btree_node_bkey_bad_format, 55, 0) \ + x(btree_node_bad_bkey, 56, 0) \ + x(btree_node_bkey_out_of_order, 57, 0) \ + x(btree_root_bkey_invalid, 58, 0) \ + x(btree_root_read_error, 59, 0) \ + x(btree_root_bad_min_key, 60, 0) \ + x(btree_root_bad_max_key, 61, 0) \ + x(btree_node_read_error, 62, 0) \ + x(btree_node_topology_bad_min_key, 63, 0) \ + x(btree_node_topology_bad_max_key, 64, 0) \ + x(btree_node_topology_overwritten_by_prev_node, 65, 0) \ + x(btree_node_topology_overwritten_by_next_node, 66, 0) \ + x(btree_node_topology_interior_node_empty, 67, 0) \ + x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ + x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ + x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ + x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \ + x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \ + x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \ + x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \ + x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \ + x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \ + x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \ + x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \ + x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \ + x(bkey_version_in_future, 80, 0) \ + x(bkey_u64s_too_small, 81, 0) \ + x(bkey_invalid_type_for_btree, 82, 0) \ + x(bkey_extent_size_zero, 83, 0) \ + x(bkey_extent_size_greater_than_offset, 84, 0) \ + x(bkey_size_nonzero, 85, 0) \ + x(bkey_snapshot_nonzero, 86, 0) \ + 
x(bkey_snapshot_zero, 87, 0) \ + x(bkey_at_pos_max, 88, 0) \ + x(bkey_before_start_of_btree_node, 89, 0) \ + x(bkey_after_end_of_btree_node, 90, 0) \ + x(bkey_val_size_nonzero, 91, 0) \ + x(bkey_val_size_too_small, 92, 0) \ + x(alloc_v1_val_size_bad, 93, 0) \ + x(alloc_v2_unpack_error, 94, 0) \ + x(alloc_v3_unpack_error, 95, 0) \ + x(alloc_v4_val_size_bad, 96, 0) \ + x(alloc_v4_backpointers_start_bad, 97, 0) \ + x(alloc_key_data_type_bad, 98, 0) \ + x(alloc_key_empty_but_have_data, 99, 0) \ + x(alloc_key_dirty_sectors_0, 100, 0) \ + x(alloc_key_data_type_inconsistency, 101, 0) \ + x(alloc_key_to_missing_dev_bucket, 102, 0) \ + x(alloc_key_cached_inconsistency, 103, 0) \ + x(alloc_key_cached_but_read_time_zero, 104, 0) \ + x(alloc_key_to_missing_lru_entry, 105, 0) \ + x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \ + x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \ + x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \ + x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ + x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ + x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ + x(bucket_sector_count_overflow, 112, 0) \ + x(bucket_metadata_type_mismatch, 113, 0) \ + x(need_discard_key_wrong, 114, 0) \ + x(freespace_key_wrong, 115, 0) \ + x(freespace_hole_missing, 116, 0) \ + x(bucket_gens_val_size_bad, 117, 0) \ + x(bucket_gens_key_wrong, 118, 0) \ + x(bucket_gens_hole_wrong, 119, 0) \ + x(bucket_gens_to_invalid_dev, 120, 0) \ + x(bucket_gens_to_invalid_buckets, 121, 0) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122, 0) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ + x(need_discard_freespace_key_bad, 124, 0) \ + x(backpointer_bucket_offset_wrong, 125, 0) \ + x(backpointer_to_missing_device, 126, 0) \ + x(backpointer_to_missing_alloc, 127, 0) \ + x(backpointer_to_missing_ptr, 128, 0) \ + x(lru_entry_at_time_0, 129, 0) \ + x(lru_entry_to_invalid_bucket, 130, 0) \ + x(lru_entry_bad, 131, 0) \ + x(btree_ptr_val_too_big, 132, 0) \ + 
x(btree_ptr_v2_val_too_big, 133, 0) \ + x(btree_ptr_has_non_ptr, 134, 0) \ + x(extent_ptrs_invalid_entry, 135, 0) \ + x(extent_ptrs_no_ptrs, 136, 0) \ + x(extent_ptrs_too_many_ptrs, 137, 0) \ + x(extent_ptrs_redundant_crc, 138, 0) \ + x(extent_ptrs_redundant_stripe, 139, 0) \ + x(extent_ptrs_unwritten, 140, 0) \ + x(extent_ptrs_written_and_unwritten, 141, 0) \ + x(ptr_to_invalid_device, 142, 0) \ + x(ptr_to_duplicate_device, 143, 0) \ + x(ptr_after_last_bucket, 144, 0) \ + x(ptr_before_first_bucket, 145, 0) \ + x(ptr_spans_multiple_buckets, 146, 0) \ + x(ptr_to_missing_backpointer, 147, 0) \ + x(ptr_to_missing_alloc_key, 148, 0) \ + x(ptr_to_missing_replicas_entry, 149, 0) \ + x(ptr_to_missing_stripe, 150, 0) \ + x(ptr_to_incorrect_stripe, 151, 0) \ + x(ptr_gen_newer_than_bucket_gen, 152, 0) \ + x(ptr_too_stale, 153, 0) \ + x(stale_dirty_ptr, 154, 0) \ + x(ptr_bucket_data_type_mismatch, 155, 0) \ + x(ptr_cached_and_erasure_coded, 156, 0) \ + x(ptr_crc_uncompressed_size_too_small, 157, 0) \ + x(ptr_crc_csum_type_unknown, 158, 0) \ + x(ptr_crc_compression_type_unknown, 159, 0) \ + x(ptr_crc_redundant, 160, 0) \ + x(ptr_crc_uncompressed_size_too_big, 161, 0) \ + x(ptr_crc_nonce_mismatch, 162, 0) \ + x(ptr_stripe_redundant, 163, 0) \ + x(reservation_key_nr_replicas_invalid, 164, 0) \ + x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(stripe_pos_bad, 167, 0) \ + x(stripe_val_size_bad, 168, 0) \ + x(stripe_sector_count_wrong, 169, 0) \ + x(snapshot_tree_pos_bad, 170, 0) \ + x(snapshot_tree_to_missing_snapshot, 171, 0) \ + x(snapshot_tree_to_missing_subvol, 172, 0) \ + x(snapshot_tree_to_wrong_subvol, 173, 0) \ + x(snapshot_tree_to_snapshot_subvol, 174, 0) \ + x(snapshot_pos_bad, 175, 0) \ + x(snapshot_parent_bad, 176, 0) \ + x(snapshot_children_not_normalized, 177, 0) \ + x(snapshot_child_duplicate, 178, 0) \ + x(snapshot_child_bad, 179, 0) \ + x(snapshot_skiplist_not_normalized, 180, 0) \ + x(snapshot_skiplist_bad, 181, 0) \ + 
x(snapshot_should_not_have_subvol, 182, 0) \ + x(snapshot_to_bad_snapshot_tree, 183, 0) \ + x(snapshot_bad_depth, 184, 0) \ + x(snapshot_bad_skiplist, 185, 0) \ + x(subvol_pos_bad, 186, 0) \ + x(subvol_not_master_and_not_snapshot, 187, 0) \ + x(subvol_to_missing_root, 188, 0) \ + x(subvol_root_wrong_bi_subvol, 189, 0) \ + x(bkey_in_missing_snapshot, 190, 0) \ + x(inode_pos_inode_nonzero, 191, 0) \ + x(inode_pos_blockdev_range, 192, 0) \ + x(inode_unpack_error, 193, 0) \ + x(inode_str_hash_invalid, 194, 0) \ + x(inode_v3_fields_start_bad, 195, 0) \ + x(inode_snapshot_mismatch, 196, 0) \ + x(inode_unlinked_but_clean, 197, 0) \ + x(inode_unlinked_but_nlink_nonzero, 198, 0) \ + x(inode_checksum_type_invalid, 199, 0) \ + x(inode_compression_type_invalid, 200, 0) \ + x(inode_subvol_root_but_not_dir, 201, 0) \ + x(inode_i_size_dirty_but_clean, 202, 0) \ + x(inode_i_sectors_dirty_but_clean, 203, 0) \ + x(inode_i_sectors_wrong, 204, 0) \ + x(inode_dir_wrong_nlink, 205, 0) \ + x(inode_dir_multiple_links, 206, 0) \ + x(inode_multiple_links_but_nlink_0, 207, 0) \ + x(inode_wrong_backpointer, 208, 0) \ + x(inode_wrong_nlink, 209, 0) \ + x(inode_unreachable, 210, 0) \ + x(deleted_inode_but_clean, 211, 0) \ + x(deleted_inode_missing, 212, 0) \ + x(deleted_inode_is_dir, 213, 0) \ + x(deleted_inode_not_unlinked, 214, 0) \ + x(extent_overlapping, 215, 0) \ + x(extent_in_missing_inode, 216, 0) \ + x(extent_in_non_reg_inode, 217, 0) \ + x(extent_past_end_of_inode, 218, 0) \ + x(dirent_empty_name, 219, 0) \ + x(dirent_val_too_big, 220, 0) \ + x(dirent_name_too_long, 221, 0) \ + x(dirent_name_embedded_nul, 222, 0) \ + x(dirent_name_dot_or_dotdot, 223, 0) \ + x(dirent_name_has_slash, 224, 0) \ + x(dirent_d_type_wrong, 225, 0) \ + x(inode_bi_parent_wrong, 226, 0) \ + x(dirent_in_missing_dir_inode, 227, 0) \ + x(dirent_in_non_dir_inode, 228, 0) \ + x(dirent_to_missing_inode, 229, 0) \ + x(dirent_to_missing_subvol, 230, 0) \ + x(dirent_to_itself, 231, 0) \ + x(quota_type_invalid, 232, 0) \ 
+ x(xattr_val_size_too_small, 233, 0) \ + x(xattr_val_size_too_big, 234, 0) \ + x(xattr_invalid_type, 235, 0) \ + x(xattr_name_invalid_chars, 236, 0) \ + x(xattr_in_missing_inode, 237, 0) \ + x(root_subvol_missing, 238, 0) \ + x(root_dir_missing, 239, 0) \ + x(root_inode_not_dir, 240, 0) \ + x(dir_loop, 241, 0) \ + x(hash_table_key_duplicate, 242, 0) \ + x(hash_table_key_wrong_offset, 243, 0) \ + x(unlinked_inode_not_on_deleted_list, 244, 0) \ + x(reflink_p_front_pad_bad, 245, 0) \ + x(journal_entry_dup_same_device, 246, 0) \ + x(inode_bi_subvol_missing, 247, 0) \ + x(inode_bi_subvol_wrong, 248, 0) \ + x(inode_points_to_missing_dirent, 249, 0) \ + x(inode_points_to_wrong_dirent, 250, 0) \ + x(inode_bi_parent_nonzero, 251, 0) \ + x(dirent_to_missing_parent_subvol, 252, 0) \ + x(dirent_not_visible_in_parent_subvol, 253, 0) \ + x(subvol_fs_path_parent_wrong, 254, 0) \ + x(subvol_root_fs_path_parent_nonzero, 255, 0) \ + x(subvol_children_not_set, 256, 0) \ + x(subvol_children_bad, 257, 0) \ + x(subvol_loop, 258, 0) \ + x(subvol_unreachable, 259, 0) \ + x(btree_node_bkey_bad_u64s, 260, 0) \ + x(btree_node_topology_empty_interior_node, 261, 0) \ + x(btree_ptr_v2_min_key_bad, 262, 0) \ + x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ + x(snapshot_node_missing, 264, 0) \ + x(dup_backpointer_to_bad_csum_extent, 265, 0) \ + x(btree_bitmap_not_marked, 266, 0) \ + x(sb_clean_entry_overrun, 267, 0) \ + x(btree_ptr_v2_written_0, 268, 0) \ + x(subvol_snapshot_bad, 269, 0) \ + x(subvol_inode_bad, 270, 0) \ + x(alloc_key_stripe_sectors_wrong, 271, 0) \ + x(accounting_mismatch, 272, 0) \ + x(accounting_replicas_not_marked, 273, 0) \ + x(invalid_btree_id, 274, 0) \ + x(alloc_key_io_time_bad, 275, 0) enum bch_sb_error_id { -#define x(t, n) BCH_FSCK_ERR_##t = n, +#define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, BCH_SB_ERRS() #undef x BCH_SB_ERR_MAX From c6cab97cdfd14571a17b9453b1d339eaa3b77c0b Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 20 Jun 2024 09:22:42 +0800 Subject: [PATCH 221/272] bcachefs: fix alignment of VMA for memory mapped files on THP With CONFIG_READ_ONLY_THP_FOR_FS, the Linux kernel supports using THPs for read-only mmapped files, such as shared libraries. However, the kernel makes no attempt to actually align those mappings on 2MB boundaries, which makes it impossible to use those THPs most of the time. This issue applies to general file mapping THP as well as existing setups using CONFIG_READ_ONLY_THP_FOR_FS. This is easily fixed by using thp_get_unmapped_area for the unmapped_area function in bcachefs, which is what ext2, ext4, fuse, xfs and btrfs all use. Similar to commit b0c582233a85 ("btrfs: fix alignment of VMA for memory mapped files on THP"). Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 77126992dba8c..8314d3e1582d3 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1157,6 +1157,7 @@ static const struct file_operations bch_file_operations = { .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, .mmap = bch2_mmap, + .get_unmapped_area = thp_get_unmapped_area, .fsync = bch2_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, From 8ad04409921f4c405859a651c9a9e5ff5eb5e8a9 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 18 Jun 2024 14:53:11 -0700 Subject: [PATCH 222/272] bnxt_en: Update firmware interface to 1.10.3.44 The relevant change is the max_tso_segs value returned by firmware in the HWRM_FUNC_QCAPS response. This value will be used in the next patch to cap the TSO segments. 
Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240618215313.29631-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 311 ++++++++++-------- 1 file changed, 178 insertions(+), 133 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h index 06ea86c80be18..f219709f95635 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h @@ -2,7 +2,7 @@ * * Copyright (c) 2014-2016 Broadcom Corporation * Copyright (c) 2014-2018 Broadcom Limited - * Copyright (c) 2018-2023 Broadcom Inc. + * Copyright (c) 2018-2024 Broadcom Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -500,7 +500,11 @@ struct cmd_nums { #define HWRM_TFC_IF_TBL_GET 0x399UL #define HWRM_TFC_TBL_SCOPE_CONFIG_GET 0x39aUL #define HWRM_TFC_RESC_USAGE_QUERY 0x39bUL + #define HWRM_QUEUE_PFCWD_TIMEOUT_QCAPS 0x39cUL + #define HWRM_QUEUE_PFCWD_TIMEOUT_CFG 0x39dUL + #define HWRM_QUEUE_PFCWD_TIMEOUT_QCFG 0x39eUL #define HWRM_SV 0x400UL + #define HWRM_DBG_LOG_BUFFER_FLUSH 0xff0fUL #define HWRM_DBG_READ_DIRECT 0xff10UL #define HWRM_DBG_READ_INDIRECT 0xff11UL #define HWRM_DBG_WRITE_DIRECT 0xff12UL @@ -609,8 +613,8 @@ struct hwrm_err_output { #define HWRM_VERSION_MAJOR 1 #define HWRM_VERSION_MINOR 10 #define HWRM_VERSION_UPDATE 3 -#define HWRM_VERSION_RSVD 39 -#define HWRM_VERSION_STR "1.10.3.39" +#define HWRM_VERSION_RSVD 44 +#define HWRM_VERSION_STR "1.10.3.44" /* hwrm_ver_get_input (size:192b/24B) */ struct hwrm_ver_get_input { @@ -664,6 +668,7 @@ struct hwrm_ver_get_output { #define VER_GET_RESP_DEV_CAPS_CFG_CFA_TFLIB_SUPPORTED 0x2000UL #define VER_GET_RESP_DEV_CAPS_CFG_CFA_TRUFLOW_SUPPORTED 0x4000UL #define VER_GET_RESP_DEV_CAPS_CFG_SECURE_BOOT_CAPABLE 0x8000UL + #define VER_GET_RESP_DEV_CAPS_CFG_SECURE_SOC_CAPABLE 0x10000UL u8 
roce_fw_maj_8b; u8 roce_fw_min_8b; u8 roce_fw_bld_8b; @@ -843,7 +848,9 @@ struct hwrm_async_event_cmpl { #define ASYNC_EVENT_CMPL_EVENT_ID_HW_DOORBELL_RECOVERY_READ_ERROR 0x49UL #define ASYNC_EVENT_CMPL_EVENT_ID_CTX_ERROR 0x4aUL #define ASYNC_EVENT_CMPL_EVENT_ID_UDCC_SESSION_CHANGE 0x4bUL - #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID 0x4cUL + #define ASYNC_EVENT_CMPL_EVENT_ID_DBG_BUF_PRODUCER 0x4cUL + #define ASYNC_EVENT_CMPL_EVENT_ID_PEER_MMAP_CHANGE 0x4dUL + #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID 0x4eUL #define ASYNC_EVENT_CMPL_EVENT_ID_FW_TRACE_MSG 0xfeUL #define ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR 0xffUL #define ASYNC_EVENT_CMPL_EVENT_ID_LAST ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR @@ -1326,13 +1333,13 @@ struct hwrm_async_event_cmpl_error_report_base { u8 timestamp_lo; __le16 timestamp_hi; __le32 event_data1; - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_MASK 0xffUL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_SFT 0 - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_RESERVED 0x0UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM 0x1UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL 0x2UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM 0x3UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD 0x4UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_MASK 0xffUL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_SFT 0 + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_RESERVED 0x0UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM 0x1UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL 0x2UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM 0x3UL + #define 
ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD 0x4UL #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_THERMAL_THRESHOLD 0x5UL #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED 0x6UL #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_LAST ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED @@ -1814,6 +1821,9 @@ struct hwrm_func_qcaps_output { #define FUNC_QCAPS_RESP_FLAGS_EXT2_SW_MAX_RESOURCE_LIMITS_SUPPORTED 0x800000UL #define FUNC_QCAPS_RESP_FLAGS_EXT2_TF_INGRESS_NIC_FLOW_SUPPORTED 0x1000000UL #define FUNC_QCAPS_RESP_FLAGS_EXT2_LPBK_STATS_SUPPORTED 0x2000000UL + #define FUNC_QCAPS_RESP_FLAGS_EXT2_TF_EGRESS_NIC_FLOW_SUPPORTED 0x4000000UL + #define FUNC_QCAPS_RESP_FLAGS_EXT2_MULTI_LOSSLESS_QUEUES_SUPPORTED 0x8000000UL + #define FUNC_QCAPS_RESP_FLAGS_EXT2_PEER_MMAP_SUPPORTED 0x10000000UL __le16 tunnel_disable_flag; #define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_VXLAN 0x1UL #define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_NGE 0x2UL @@ -1828,7 +1838,7 @@ struct hwrm_func_qcaps_output { #define FUNC_QCAPS_RESP_XID_PARTITION_CAP_RX_CK 0x2UL u8 device_serial_number[8]; __le16 ctxs_per_partition; - u8 unused_2[2]; + __le16 max_tso_segs; __le32 roce_vf_max_av; __le32 roce_vf_max_cq; __le32 roce_vf_max_mrw; @@ -2449,6 +2459,7 @@ struct hwrm_func_drv_rgtr_input { #define FUNC_DRV_RGTR_REQ_FLAGS_NPAR_1_2_SUPPORT 0x200UL #define FUNC_DRV_RGTR_REQ_FLAGS_ASYM_QUEUE_CFG_SUPPORT 0x400UL #define FUNC_DRV_RGTR_REQ_FLAGS_TF_INGRESS_NIC_FLOW_MODE 0x800UL + #define FUNC_DRV_RGTR_REQ_FLAGS_TF_EGRESS_NIC_FLOW_MODE 0x1000UL __le32 enables; #define FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE 0x1UL #define FUNC_DRV_RGTR_REQ_ENABLES_VER 0x2UL @@ -3660,22 +3671,24 @@ struct hwrm_func_backing_store_cfg_v2_input { __le16 target_id; __le64 resp_addr; __le16 type; - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_QP 0x0UL - #define 
FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ 0x1UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ 0x2UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_VNIC 0x3UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_STAT 0x4UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SP_TQM_RING 0x5UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_FP_TQM_RING 0x6UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MRAV 0xeUL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TIM 0xfUL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MP_TQM_RING 0x15UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SQ_DB_SHADOW 0x16UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RQ_DB_SHADOW 0x17UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ_DB_SHADOW 0x18UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ_DB_SHADOW 0x19UL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TBL_SCOPE 0x1cUL - #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_XID_PARTITION 0x1dUL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_QP 0x0UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ 0x1UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ 0x2UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_VNIC 0x3UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_STAT 0x4UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SP_TQM_RING 0x5UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_FP_TQM_RING 0x6UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MRAV 0xeUL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TIM 0xfUL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TX_CK 0x13UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RX_CK 0x14UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MP_TQM_RING 0x15UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SQ_DB_SHADOW 0x16UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RQ_DB_SHADOW 0x17UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ_DB_SHADOW 0x18UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ_DB_SHADOW 0x19UL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TBL_SCOPE 0x1cUL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_XID_PARTITION 0x1dUL #define 
FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRT_TRACE 0x1eUL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRT2_TRACE 0x1fUL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CRT_TRACE 0x20UL @@ -3772,18 +3785,20 @@ struct hwrm_func_backing_store_qcfg_v2_output { __le16 seq_id; __le16 resp_len; __le16 type; - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_QP 0x0UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRQ 0x1UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CQ 0x2UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_VNIC 0x3UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_STAT 0x4UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SP_TQM_RING 0x5UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_FP_TQM_RING 0x6UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MRAV 0xeUL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TIM 0xfUL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MP_TQM_RING 0x15UL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TBL_SCOPE 0x1cUL - #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_XID_PARTITION 0x1dUL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_QP 0x0UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRQ 0x1UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CQ 0x2UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_VNIC 0x3UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_STAT 0x4UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SP_TQM_RING 0x5UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_FP_TQM_RING 0x6UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MRAV 0xeUL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TIM 0xfUL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TX_CK 0x13UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_RX_CK 0x14UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MP_TQM_RING 0x15UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TBL_SCOPE 0x1cUL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_XID_PARTITION 0x1dUL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRT_TRACE 0x1eUL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRT2_TRACE 0x1fUL #define 
FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CRT_TRACE 0x20UL @@ -3876,22 +3891,24 @@ struct hwrm_func_backing_store_qcaps_v2_input { __le16 target_id; __le64 resp_addr; __le16 type; - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_QP 0x0UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ 0x1UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ 0x2UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_VNIC 0x3UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_STAT 0x4UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SP_TQM_RING 0x5UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_FP_TQM_RING 0x6UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MRAV 0xeUL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TIM 0xfUL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MP_TQM_RING 0x15UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SQ_DB_SHADOW 0x16UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RQ_DB_SHADOW 0x17UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ_DB_SHADOW 0x18UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ_DB_SHADOW 0x19UL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TBL_SCOPE 0x1cUL - #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_XID_PARTITION 0x1dUL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_QP 0x0UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ 0x1UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ 0x2UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_VNIC 0x3UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_STAT 0x4UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SP_TQM_RING 0x5UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_FP_TQM_RING 0x6UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MRAV 0xeUL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TIM 0xfUL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TX_CK 0x13UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RX_CK 0x14UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MP_TQM_RING 0x15UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SQ_DB_SHADOW 0x16UL + #define 
FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RQ_DB_SHADOW 0x17UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ_DB_SHADOW 0x18UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ_DB_SHADOW 0x19UL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TBL_SCOPE 0x1cUL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_XID_PARTITION 0x1dUL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRT_TRACE 0x1eUL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRT2_TRACE 0x1fUL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CRT_TRACE 0x20UL @@ -3911,22 +3928,24 @@ struct hwrm_func_backing_store_qcaps_v2_output { __le16 seq_id; __le16 resp_len; __le16 type; - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_QP 0x0UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ 0x1UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ 0x2UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_VNIC 0x3UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_STAT 0x4UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SP_TQM_RING 0x5UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_FP_TQM_RING 0x6UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MRAV 0xeUL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TIM 0xfUL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MP_TQM_RING 0x15UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SQ_DB_SHADOW 0x16UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RQ_DB_SHADOW 0x17UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ_DB_SHADOW 0x18UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ_DB_SHADOW 0x19UL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TBL_SCOPE 0x1cUL - #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_XID_PARTITION 0x1dUL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_QP 0x0UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ 0x1UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ 0x2UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_VNIC 0x3UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_STAT 0x4UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SP_TQM_RING 0x5UL + 
#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_FP_TQM_RING 0x6UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MRAV 0xeUL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TIM 0xfUL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TX_CK 0x13UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RX_CK 0x14UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MP_TQM_RING 0x15UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SQ_DB_SHADOW 0x16UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RQ_DB_SHADOW 0x17UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ_DB_SHADOW 0x18UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ_DB_SHADOW 0x19UL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TBL_SCOPE 0x1cUL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_XID_PARTITION 0x1dUL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRT_TRACE 0x1eUL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRT2_TRACE 0x1fUL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CRT_TRACE 0x20UL @@ -4202,7 +4221,8 @@ struct hwrm_port_phy_cfg_input { #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_100GB_PAM4_112 0x3eaUL #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_200GB_PAM4_112 0x7d2UL #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_400GB_PAM4_112 0xfa2UL - #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_LAST PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_400GB_PAM4_112 + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_800GB_PAM4_112 0x1f42UL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_LAST PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_800GB_PAM4_112 __le16 auto_link_speeds2_mask; #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_1GB 0x1UL #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_10GB 0x2UL @@ -4217,6 +4237,7 @@ struct hwrm_port_phy_cfg_input { #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_100GB_PAM4_112 0x400UL #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_200GB_PAM4_112 0x800UL #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_400GB_PAM4_112 0x1000UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_800GB_PAM4_112 0x2000UL u8 
unused_2[6]; }; @@ -4292,6 +4313,7 @@ struct hwrm_port_phy_qcfg_output { #define PORT_PHY_QCFG_RESP_LINK_SPEED_100GB 0x3e8UL #define PORT_PHY_QCFG_RESP_LINK_SPEED_200GB 0x7d0UL #define PORT_PHY_QCFG_RESP_LINK_SPEED_400GB 0xfa0UL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_800GB 0x1f40UL #define PORT_PHY_QCFG_RESP_LINK_SPEED_10MB 0xffffUL #define PORT_PHY_QCFG_RESP_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_LINK_SPEED_10MB u8 duplex_cfg; @@ -4451,7 +4473,13 @@ struct hwrm_port_phy_qcfg_output { #define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASESR4 0x35UL #define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASELR4 0x36UL #define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASEER4 0x37UL - #define PORT_PHY_QCFG_RESP_PHY_TYPE_LAST PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASEER4 + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASECR8 0x38UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASESR8 0x39UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASELR8 0x3aUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEER8 0x3bUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEFR8 0x3cUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEDR8 0x3dUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_LAST PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEDR8 u8 media_type; #define PORT_PHY_QCFG_RESP_MEDIA_TYPE_UNKNOWN 0x0UL #define PORT_PHY_QCFG_RESP_MEDIA_TYPE_TP 0x1UL @@ -5049,33 +5077,43 @@ struct hwrm_port_qstats_ext_output { u8 valid; }; -/* hwrm_port_lpbk_qstats_input (size:128b/16B) */ +/* hwrm_port_lpbk_qstats_input (size:256b/32B) */ struct hwrm_port_lpbk_qstats_input { __le16 req_type; __le16 cmpl_ring; __le16 seq_id; __le16 target_id; __le64 resp_addr; + __le16 lpbk_stat_size; + u8 flags; + #define PORT_LPBK_QSTATS_REQ_FLAGS_COUNTER_MASK 0x1UL + u8 unused_0[5]; + __le64 lpbk_stat_host_addr; }; -/* hwrm_port_lpbk_qstats_output (size:768b/96B) */ +/* hwrm_port_lpbk_qstats_output (size:128b/16B) */ struct hwrm_port_lpbk_qstats_output { __le16 error_code; __le16 req_type; __le16 seq_id; __le16 resp_len; + __le16 lpbk_stat_size; + u8 unused_0[5]; + u8 valid; +}; + 
+/* port_lpbk_stats (size:640b/80B) */ +struct port_lpbk_stats { __le64 lpbk_ucast_frames; __le64 lpbk_mcast_frames; __le64 lpbk_bcast_frames; __le64 lpbk_ucast_bytes; __le64 lpbk_mcast_bytes; __le64 lpbk_bcast_bytes; - __le64 tx_stat_discard; - __le64 tx_stat_error; - __le64 rx_stat_discard; - __le64 rx_stat_error; - u8 unused_0[7]; - u8 valid; + __le64 lpbk_tx_discards; + __le64 lpbk_tx_errors; + __le64 lpbk_rx_discards; + __le64 lpbk_rx_errors; }; /* hwrm_port_ecn_qstats_input (size:256b/32B) */ @@ -5140,13 +5178,15 @@ struct hwrm_port_clr_stats_output { u8 valid; }; -/* hwrm_port_lpbk_clr_stats_input (size:128b/16B) */ +/* hwrm_port_lpbk_clr_stats_input (size:192b/24B) */ struct hwrm_port_lpbk_clr_stats_input { __le16 req_type; __le16 cmpl_ring; __le16 seq_id; __le16 target_id; __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; }; /* hwrm_port_lpbk_clr_stats_output (size:128b/16B) */ @@ -5287,10 +5327,11 @@ struct hwrm_port_phy_qcaps_output { #define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_FORCE_MODE_100G 0x2UL #define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_FORCE_MODE_200G 0x4UL __le16 flags2; - #define PORT_PHY_QCAPS_RESP_FLAGS2_PAUSE_UNSUPPORTED 0x1UL - #define PORT_PHY_QCAPS_RESP_FLAGS2_PFC_UNSUPPORTED 0x2UL - #define PORT_PHY_QCAPS_RESP_FLAGS2_BANK_ADDR_SUPPORTED 0x4UL - #define PORT_PHY_QCAPS_RESP_FLAGS2_SPEEDS2_SUPPORTED 0x8UL + #define PORT_PHY_QCAPS_RESP_FLAGS2_PAUSE_UNSUPPORTED 0x1UL + #define PORT_PHY_QCAPS_RESP_FLAGS2_PFC_UNSUPPORTED 0x2UL + #define PORT_PHY_QCAPS_RESP_FLAGS2_BANK_ADDR_SUPPORTED 0x4UL + #define PORT_PHY_QCAPS_RESP_FLAGS2_SPEEDS2_SUPPORTED 0x8UL + #define PORT_PHY_QCAPS_RESP_FLAGS2_REMOTE_LPBK_UNSUPPORTED 0x10UL u8 internal_port_cnt; u8 unused_0; __le16 supported_speeds2_force_mode; @@ -7443,17 +7484,17 @@ struct hwrm_cfa_l2_filter_cfg_input { __le16 target_id; __le64 resp_addr; __le32 flags; - #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH 0x1UL - #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_TX 0x0UL - #define 
CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX 0x1UL - #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_LAST CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX - #define CFA_L2_FILTER_CFG_REQ_FLAGS_DROP 0x2UL - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_MASK 0xcUL - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_SFT 2 - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_NO_ROCE_L2 (0x0UL << 2) - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_L2 (0x1UL << 2) - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE (0x2UL << 2) - #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_LAST CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH 0x1UL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_TX 0x0UL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX 0x1UL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_LAST CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX + #define CFA_L2_FILTER_CFG_REQ_FLAGS_DROP 0x2UL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_MASK 0xcUL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_SFT 2 + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_NO_ROCE_L2 (0x0UL << 2) + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_L2 (0x1UL << 2) + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE (0x2UL << 2) + #define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_LAST CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE #define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_MASK 0x30UL #define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_SFT 4 #define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_NO_UPDATE (0x0UL << 4) @@ -8520,17 +8561,17 @@ struct hwrm_tunnel_dst_port_query_input { __le16 target_id; __le64 resp_addr; u8 tunnel_type; - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN 0x1UL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GENEVE 0x5UL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL - 
#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ECPRI 0xeUL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_SRV6 0xfUL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL - #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GRE 0x11UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ECPRI 0xeUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_SRV6 0xfUL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GRE 0x11UL #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR 0x12UL #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL @@ -8576,17 +8617,17 @@ struct hwrm_tunnel_dst_port_alloc_input { __le16 target_id; __le64 resp_addr; u8 tunnel_type; - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN 0x1UL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GENEVE 0x5UL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ECPRI 0xeUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_SRV6 0xfUL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL - #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GRE 0x11UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN 0x1UL 
+ #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ECPRI 0xeUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_SRV6 0xfUL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GRE 0x11UL #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR 0x12UL #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL @@ -8635,17 +8676,17 @@ struct hwrm_tunnel_dst_port_free_input { __le16 target_id; __le64 resp_addr; u8 tunnel_type; - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN 0x1UL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE 0x5UL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ECPRI 0xeUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_SRV6 0xfUL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL - #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GRE 0x11UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_IPGRE_V1 0xaUL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_L2_ETYPE 0xbUL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL + 
#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_CUSTOM_GRE 0xdUL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ECPRI 0xeUL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_SRV6 0xfUL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE 0x10UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GRE 0x11UL #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR 0x12UL #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL @@ -9109,6 +9150,7 @@ struct hwrm_struct_hdr { #define STRUCT_HDR_STRUCT_ID_LLDP_GENERIC 0x424UL #define STRUCT_HDR_STRUCT_ID_LLDP_DEVICE 0x426UL #define STRUCT_HDR_STRUCT_ID_POWER_BKUP 0x427UL + #define STRUCT_HDR_STRUCT_ID_PEER_MMAP 0x429UL #define STRUCT_HDR_STRUCT_ID_AFM_OPAQUE 0x1UL #define STRUCT_HDR_STRUCT_ID_PORT_DESCRIPTION 0xaUL #define STRUCT_HDR_STRUCT_ID_RSS_V2 0x64UL @@ -9758,6 +9800,9 @@ struct hwrm_dbg_coredump_initiate_input { __le16 instance; __le16 unused_0; u8 seg_flags; + #define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_LIVE_DATA 0x1UL + #define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_CRASH_DATA 0x2UL + #define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_COLLECT_CTX_L1_CACHE 0x4UL u8 unused_1[7]; }; @@ -10433,13 +10478,13 @@ struct hwrm_selftest_irq_output { /* dbc_dbc (size:64b/8B) */ struct dbc_dbc { - u32 index; + __le32 index; #define DBC_DBC_INDEX_MASK 0xffffffUL #define DBC_DBC_INDEX_SFT 0 #define DBC_DBC_EPOCH 0x1000000UL #define DBC_DBC_TOGGLE_MASK 0x6000000UL #define DBC_DBC_TOGGLE_SFT 25 - u32 type_path_xid; + __le32 type_path_xid; #define DBC_DBC_XID_MASK 0xfffffUL #define DBC_DBC_XID_SFT 0 #define DBC_DBC_PATH_MASK 0x3000000UL From b7bfcb4c7ce44fd0070ce8bccbc91c56341f05c1 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 18 Jun 2024 14:53:12 -0700 Subject: [PATCH 223/272] bnxt_en: Set TSO max segs on devices with limits Firmware will now advertise a non-zero TSO max segments if the device has a limit. 0 means no limit. 
The latest 5760X chip (early revs) has a limit of 2047 that cannot be exceeded. If exceeded, the chip will send out just a small number of segments. Call netif_set_tso_max_segs() if the device has a limit. Fixes: 2012a6abc876 ("bnxt_en: Add 5760X (P7) PCI IDs") Reviewed-by: Ajit Khaparde Reviewed-by: Somnath Kotur Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240618215313.29631-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c437ca1c0fd39..89d29d6d75175 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8996,6 +8996,7 @@ static int __bnxt_hwrm_func_qcaps(struct bnxt *bp) memcpy(vf->mac_addr, resp->mac_address, ETH_ALEN); #endif } + bp->tso_max_segs = le16_to_cpu(resp->max_tso_segs); hwrm_func_qcaps_exit: hwrm_req_drop(bp, req); @@ -15363,6 +15364,8 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->priv_flags |= IFF_UNICAST_FLT; netif_set_tso_max_size(dev, GSO_MAX_SIZE); + if (bp->tso_max_segs) + netif_set_tso_max_segs(dev, bp->tso_max_segs); dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_RX_SG; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index bbc7edccd5a4d..9cf0acfa04e57 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2318,6 +2318,7 @@ struct bnxt { u8 rss_hash_key_updated:1; u16 max_mtu; + u16 tso_max_segs; u8 max_tc; u8 max_lltc; /* lossless TCs */ struct bnxt_queue_info q_info[BNXT_MAX_QUEUE]; From 1e7962114c10957fe4d10a15eb714578a394e90b Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Tue, 18 Jun 2024 14:53:13 -0700 Subject: [PATCH 224/272] bnxt_en: Restore 
PTP tx_avail count in case of skb_pad() error The current code only restores PTP tx_avail count when we get DMA mapping errors. Fix it so that the PTP tx_avail count will be restored for both DMA mapping errors and skb_pad() errors. Otherwise PTP TX timestamp will not be available after a PTP packet hits the skb_pad() error. Fixes: 83bb623c968e ("bnxt_en: Transmit and retrieve packet timestamps") Reviewed-by: Andy Gospodarek Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240618215313.29631-4-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 89d29d6d75175..a6d69a45fa014 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -732,9 +732,6 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; tx_dma_error: - if (BNXT_TX_PTP_IS_SET(lflags)) - atomic_inc(&bp->ptp_cfg->tx_avail); - last_frag = i; /* start back at beginning and unmap skb */ @@ -756,6 +753,8 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_free: dev_kfree_skb_any(skb); tx_kick_pending: + if (BNXT_TX_PTP_IS_SET(lflags)) + atomic_inc(&bp->ptp_cfg->tx_avail); if (txr->kick_pending) bnxt_txr_db_kick(bp, txr, txr->tx_prod); txr->tx_buf_ring[txr->tx_prod].skb = NULL; From 48dea8f7bb011608fd969749a1980f8311ef45f2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 19 Jun 2024 08:17:48 +0200 Subject: [PATCH 225/272] selftests: virtio_net: add forgotten config options One may use tools/testing/selftests/drivers/net/virtio_net/config for example for vng build command like this one: $ vng -v -b -f tools/testing/selftests/drivers/net/virtio_net/config In that case, the needed kernel config options are 
not turned on. Add the missed kernel config options. Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240617072614.75fe79e7@kernel.org/ Reported-by: Matthieu Baerts Closes: https://lore.kernel.org/netdev/1a63f209-b1d4-4809-bc30-295a5cafa296@kernel.org/ Fixes: ccfaed04db5e ("selftests: virtio_net: add initial tests") Signed-off-by: Jiri Pirko Reviewed-by: Xuan Zhuo Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20240619061748.1869404-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/virtio_net/config | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/virtio_net/config b/tools/testing/selftests/drivers/net/virtio_net/config index f35de0542b608..bcf7555eaffea 100644 --- a/tools/testing/selftests/drivers/net/virtio_net/config +++ b/tools/testing/selftests/drivers/net/virtio_net/config @@ -1,2 +1,8 @@ -CONFIG_VIRTIO_NET=y +CONFIG_BPF_SYSCALL=y +CONFIG_CGROUP_BPF=y +CONFIG_IPV6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_VRF=m CONFIG_VIRTIO_DEBUG=y +CONFIG_VIRTIO_NET=y From fba383985354e83474f95f36d7c65feb75dba19d Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 19 Jun 2024 15:28:03 +0200 Subject: [PATCH 226/272] net: usb: rtl8150 fix uninitialized variables in rtl8150_get_link_ksettings This function retrieves values by passing a pointer. As the function that retrieves them can fail before touching the pointers, the variables must be initialized. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+5186630949e3c55f0799@syzkaller.appspotmail.com Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20240619132816.11526-1-oneukum@suse.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/rtl8150.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index 97afd7335d868..01a3b2417a540 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -778,7 +778,8 @@ static int rtl8150_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *ecmd) { rtl8150_t *dev = netdev_priv(netdev); - short lpa, bmcr; + short lpa = 0; + short bmcr = 0; u32 supported; supported = (SUPPORTED_10baseT_Half | From f3ced000a2df53f4b12849e121769045a81a3b22 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 10 Jun 2024 18:48:45 -0700 Subject: [PATCH 227/272] KVM: x86: Always sync PIR to IRR prior to scanning I/O APIC routes Sync pending posted interrupts to the IRR prior to re-scanning I/O APIC routes, irrespective of whether the I/O APIC is emulated by userspace or by KVM. If a level-triggered interrupt routed through the I/O APIC is pending or in-service for a vCPU, KVM needs to intercept EOIs on said vCPU even if the vCPU isn't the destination for the new routing, e.g. if servicing an interrupt using the old routing races with I/O APIC reconfiguration. Commit fceb3a36c29a ("KVM: x86: ioapic: Fix level-triggered EOI and userspace I/OAPIC reconfigure race") fixed the common cases, but kvm_apic_pending_eoi() only checks if an interrupt is in the local APIC's IRR or ISR, i.e. misses the uncommon case where an interrupt is pending in the PIR. Failure to intercept EOI can manifest as guest hangs with Windows 11 if the guest uses the RTC as its timekeeping source, e.g. if the VMM doesn't expose a more modern form of time to the guest. 
Cc: stable@vger.kernel.org Cc: Adamos Ttofari Cc: Raghavendra Rao Ananta Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Message-ID: <20240611014845.82795-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8c9e4281d978d..0763a0f72a067 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10718,13 +10718,12 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); - else { - static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); - if (ioapic_in_kernel(vcpu->kvm)) - kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); - } + else if (ioapic_in_kernel(vcpu->kvm)) + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); if (is_guest_mode(vcpu)) vcpu->arch.load_eoi_exitmap_pending = true; From b018589013d6db43fdc894c635d6590e0a7e3285 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 10 Jun 2024 09:34:27 -0700 Subject: [PATCH 228/272] MAINTAINERS: Drop Wanpeng Li as a Reviewer for KVM Paravirt support Drop Wanpeng as a KVM PARAVIRT reviewer as his @tencent.com email is bouncing, and according to lore[*], the last activity from his @gmail.com address was almost two years ago. 
[*] https://lore.kernel.org/all/CANRm+Cwj29M9HU3=JRUOaKDR+iDKgr0eNMWQi0iLkR5THON-bg@mail.gmail.com Cc: Wanpeng Li Cc: Like Xu Signed-off-by: Sean Christopherson Message-ID: <20240610163427.3359426-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8754ac2c259dc..5d62fe9495e6d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12383,7 +12383,6 @@ F: drivers/video/backlight/ktz8866.c KVM PARAVIRT (KVM/paravirt) M: Paolo Bonzini -R: Wanpeng Li R: Vitaly Kuznetsov L: kvm@vger.kernel.org S: Supported From f474092c6fe1e2154a35308a1a1aef3212c3ecf2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 10 Jun 2024 13:31:21 +0300 Subject: [PATCH 229/272] kvm: do not account temporary allocations to kmem Some allocations done by KVM are temporary: they are created as a result of program actions, but can't exist for arbitrarily long times. They should have been GFP_TEMPORARY (rip!). OTOH, kvm-nx-lpage-recovery and kvm-pit kernel threads exist for as long as the VM exists but their task_struct memory is not accounted. This is a story for another day.
Signed-off-by: Alexey Dobriyan Message-ID: Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 14841acb8b959..8e422c2c9450f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4427,7 +4427,7 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_regs *kvm_regs; r = -ENOMEM; - kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); if (!kvm_regs) goto out; r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); @@ -4454,8 +4454,7 @@ static long kvm_vcpu_ioctl(struct file *filp, break; } case KVM_GET_SREGS: { - kvm_sregs = kzalloc(sizeof(struct kvm_sregs), - GFP_KERNEL_ACCOUNT); + kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); r = -ENOMEM; if (!kvm_sregs) goto out; @@ -4547,7 +4546,7 @@ static long kvm_vcpu_ioctl(struct file *filp, break; } case KVM_GET_FPU: { - fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); + fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); r = -ENOMEM; if (!fpu) goto out; @@ -6210,7 +6209,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) active = kvm_active_vms; mutex_unlock(&kvm_lock); - env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); + env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) return; @@ -6226,7 +6225,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) add_uevent_var(env, "PID=%d", kvm->userspace_pid); if (!IS_ERR(kvm->debugfs_dentry)) { - char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); + char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); if (p) { tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); From ce5291e56081730ec7d87bc9aa41f3de73ff3256 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Jun 2024 20:30:00 +0100 Subject: [PATCH 230/272] cifs: Defer read completion Defer read completion from the I/O thread to the cifsiod thread so as not 
to slow down the I/O thread. This restores the behaviour of v6.9. Fixes: 3ee1a1fc3981 ("cifs: Cut over to using netfslib") Signed-off-by: David Howells cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 38a06e8a0f90f..e213cecd50946 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4484,6 +4484,16 @@ smb2_new_read_req(void **buf, unsigned int *total_len, return rc; } +static void smb2_readv_worker(struct work_struct *work) +{ + struct cifs_io_subrequest *rdata = + container_of(work, struct cifs_io_subrequest, subreq.work); + + netfs_subreq_terminated(&rdata->subreq, + (rdata->result == 0 || rdata->result == -EAGAIN) ? + rdata->got_bytes : rdata->result, true); +} + static void smb2_readv_callback(struct mid_q_entry *mid) { @@ -4578,9 +4588,8 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->result = 0; } rdata->credits.value = 0; - netfs_subreq_terminated(&rdata->subreq, - (rdata->result == 0 || rdata->result == -EAGAIN) ? - rdata->got_bytes : rdata->result, true); + INIT_WORK(&rdata->subreq.work, smb2_readv_worker); + queue_work(cifsiod_wq, &rdata->subreq.work); release_mid(mid); add_credits(server, &credits, 0); } From 969b3010cbfcf58de65399dff8252c41b5e79292 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Jun 2024 18:31:28 +0100 Subject: [PATCH 231/272] cifs: Only pick a channel once per read request In cifs, only pick a channel when setting up a read request rather than doing so individually for every subrequest and instead use that channel for all. This mirrors what the code in v6.9 does. 
Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 + fs/smb/client/file.c | 14 +++----------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 73482734a8d8e..0978997ddfa6b 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1494,6 +1494,7 @@ struct cifs_aio_ctx { struct cifs_io_request { struct netfs_io_request rreq; struct cifsFileInfo *cfile; + struct TCP_Server_Info *server; }; /* asynchronous read support */ diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 1e269e0bc75b3..4dbd80168a2bc 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -134,17 +134,15 @@ static void cifs_issue_write(struct netfs_io_subrequest *subreq) static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; - struct TCP_Server_Info *server; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); + struct TCP_Server_Info *server = req->server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); size_t rsize = 0; int rc; rdata->xid = get_xid(); rdata->have_xid = true; - - server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; if (cifs_sb->ctx->rsize == 0) @@ -203,14 +201,7 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); rdata->pid = pid; - rc = adjust_credits(rdata->server, &rdata->credits, rdata->subreq.len); - if (!rc) { - if (rdata->req->cfile->invalidHandle) - rc = -EAGAIN; - else - rc = rdata->server->ops->async_readv(rdata); - } - + rc = rdata->server->ops->async_readv(rdata); out: if (rc) 
netfs_subreq_terminated(subreq, rc, false); @@ -250,6 +241,7 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) open_file = file->private_data; rreq->netfs_priv = file->private_data; req->cfile = cifsFileInfo_get(open_file); + req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); } else if (rreq->origin != NETFS_WRITEBACK) { WARN_ON_ONCE(1); return -EIO; From 3f59138580bf8006fa99641b5803d0f683709f10 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Jun 2024 18:31:29 +0100 Subject: [PATCH 232/272] cifs: Move the 'pid' from the subreq to the req Move the reference pid from the cifs_io_subrequest struct to the cifs_io_request struct as it's the same for all subreqs of a particular request. Signed-off-by: David Howells cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 2 +- fs/smb/client/cifssmb.c | 8 ++++---- fs/smb/client/file.c | 11 +++-------- fs/smb/client/smb2pdu.c | 4 ++-- 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 0978997ddfa6b..557b68e99d0a0 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1495,6 +1495,7 @@ struct cifs_io_request { struct netfs_io_request rreq; struct cifsFileInfo *cfile; struct TCP_Server_Info *server; + pid_t pid; }; /* asynchronous read support */ @@ -1505,7 +1506,6 @@ struct cifs_io_subrequest { struct cifs_io_request *req; }; ssize_t got_bytes; - pid_t pid; unsigned int xid; int result; bool have_xid; diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 25e9ab947c171..595c4b673707e 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1345,8 +1345,8 @@ cifs_async_readv(struct cifs_io_subrequest *rdata) if (rc) return rc; - smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid); - smb->hdr.PidHigh = 
cpu_to_le16((__u16)(rdata->pid >> 16)); + smb->hdr.Pid = cpu_to_le16((__u16)rdata->req->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->req->pid >> 16)); smb->AndXCommand = 0xFF; /* none */ smb->Fid = rdata->req->cfile->fid.netfid; @@ -1689,8 +1689,8 @@ cifs_async_writev(struct cifs_io_subrequest *wdata) if (rc) goto async_writev_out; - smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid); - smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16)); + smb->hdr.Pid = cpu_to_le16((__u16)wdata->req->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->req->pid >> 16)); smb->AndXCommand = 0xFF; /* none */ smb->Fid = wdata->req->cfile->fid.netfid; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 4dbd80168a2bc..f1f2573bb18df 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -177,15 +177,8 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); - pid_t pid; int rc = 0; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) - pid = req->cfile->pid; - else - pid = current->tgid; // Ummm... This may be a workqueue - cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", __func__, rreq->debug_id, subreq->debug_index, rreq->mapping, subreq->transferred, subreq->len); @@ -199,7 +192,6 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) } __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - rdata->pid = pid; rc = rdata->server->ops->async_readv(rdata); out: @@ -236,12 +228,15 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) rreq->rsize = cifs_sb->ctx->rsize; rreq->wsize = cifs_sb->ctx->wsize; + req->pid = current->tgid; // Ummm... 
This may be a workqueue if (file) { open_file = file->private_data; rreq->netfs_priv = file->private_data; req->cfile = cifsFileInfo_get(open_file); req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + req->pid = req->cfile->pid; } else if (rreq->origin != NETFS_WRITEBACK) { WARN_ON_ONCE(1); return -EIO; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index e213cecd50946..2ae2dbb6202b3 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4621,7 +4621,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) io_parms.length = rdata->subreq.len; io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid; - io_parms.pid = rdata->pid; + io_parms.pid = rdata->req->pid; rc = smb2_new_read_req( (void **) &buf, &total_len, &io_parms, rdata, 0, 0); @@ -4873,7 +4873,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) .length = wdata->subreq.len, .persistent_fid = wdata->req->cfile->fid.persistent_fid, .volatile_fid = wdata->req->cfile->fid.volatile_fid, - .pid = wdata->pid, + .pid = wdata->req->pid, }; io_parms = &_io_parms; From e7c3696d4692e8046d25f6e63f983e934e12f2c5 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 15 May 2024 10:55:28 +0100 Subject: [PATCH 233/272] firmware: psci: Fix return value from psci_system_suspend() Currently we return the value from invoke_psci_fn() directly as return value from psci_system_suspend(). It is wrong to send the PSCI interface return value directly. psci_to_linux_errno() provide the mapping from PSCI return value to the one that can be returned to the callers within the kernel. Use psci_to_linux_errno() to convert and return the correct value from psci_system_suspend(). 
Fixes: faf7ec4a92c0 ("drivers: firmware: psci: add system suspend support") Acked-by: Mark Rutland Signed-off-by: Sudeep Holla Link: https://lore.kernel.org/r/20240515095528.1949992-1-sudeep.holla@arm.com Signed-off-by: Arnd Bergmann --- drivers/firmware/psci/psci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index d9629ff878619..2328ca58bba61 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -497,10 +497,12 @@ int psci_cpu_suspend_enter(u32 state) static int psci_system_suspend(unsigned long unused) { + int err; phys_addr_t pa_cpu_resume = __pa_symbol(cpu_resume); - return invoke_psci_fn(PSCI_FN_NATIVE(1_0, SYSTEM_SUSPEND), + err = invoke_psci_fn(PSCI_FN_NATIVE(1_0, SYSTEM_SUSPEND), pa_cpu_resume, 0, 0); + return psci_to_linux_errno(err); } static int psci_system_suspend_enter(suspend_state_t state) From c31745d2c508796a0996c88bf2e55f552d513f65 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 11 Jun 2024 04:22:18 -0400 Subject: [PATCH 234/272] virt: guest_memfd: fix reference leak on hwpoisoned page If kvm_gmem_get_pfn() detects an hwpoisoned page, it returns -EHWPOISON but it does not put back the reference that kvm_gmem_get_folio() had grabbed. Add the forgotten folio_put(). 
Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Cc: stable@vger.kernel.org Reviewed-by: Liam Merwick Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini --- virt/kvm/guest_memfd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 0f4e0cf4f158b..747fe251e445b 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -510,8 +510,10 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, } if (folio_test_hwpoison(folio)) { + folio_unlock(folio); + folio_put(folio); r = -EHWPOISON; - goto out_unlock; + goto out_fput; } page = folio_file_page(folio, index); @@ -522,7 +524,6 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, r = 0; -out_unlock: folio_unlock(folio); out_fput: fput(file); From 676f819c3e982db3695a371f336a05086585ea4f Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 13 Jun 2024 20:28:03 +0800 Subject: [PATCH 235/272] KVM: Discard zero mask with function kvm_dirty_ring_reset Function kvm_reset_dirty_gfn() may be called with the parameters cur_slot / cur_offset / mask all zero, which does not represent a real dirty page. It is not necessary to clear a dirty page in this case. Also, the return value of macro __fls(), which is called in function kvm_reset_dirty_gfn(), is undefined if mask is zero. So just return in that case. Signed-off-by: Bibo Mao Message-ID: <20240613122803.1031511-1-maobibo@loongson.cn> [Move the conditional inside kvm_reset_dirty_gfn; suggested by Sean Christopherson.
- Paolo] Signed-off-by: Paolo Bonzini --- virt/kvm/dirty_ring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 86d267db87bb1..7bc74969a819a 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -55,6 +55,9 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) struct kvm_memory_slot *memslot; int as_id, id; + if (!mask) + return; + as_id = slot >> 16; id = (u16)slot; From d4e001ffeccfc128c715057e866f301ac9b95728 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 20 Jun 2024 13:34:49 +0200 Subject: [PATCH 236/272] dt-bindings: i2c: atmel,at91sam: correct path to i2c-controller schema The referenced i2c-controller.yaml schema is provided by dtschema package (outside of Linux kernel), so use full path to reference it. Cc: stable@vger.kernel.org Fixes: 7ea75dd386be ("dt-bindings: i2c: convert i2c-at91 to json-schema") Signed-off-by: Krzysztof Kozlowski Reviewed-by: Conor Dooley Signed-off-by: Andi Shyti --- Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml b/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml index b1c13bab24722..b2d19cfb87add 100644 --- a/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml @@ -77,7 +77,7 @@ required: - clocks allOf: - - $ref: i2c-controller.yaml + - $ref: /schemas/i2c/i2c-controller.yaml# - if: properties: compatible: From 5c8cfd592bb7632200b4edac8f2c7ec892ed9d81 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 20 Jun 2024 13:34:50 +0200 Subject: [PATCH 237/272] dt-bindings: i2c: google,cros-ec-i2c-tunnel: correct path to i2c-controller schema The referenced i2c-controller.yaml schema is provided by dtschema package (outside of Linux kernel), so use full path to reference it. 
Cc: stable@vger.kernel.org Fixes: 1acd4577a66f ("dt-bindings: i2c: convert i2c-cros-ec-tunnel to json-schema") Signed-off-by: Krzysztof Kozlowski Reviewed-by: Conor Dooley Signed-off-by: Andi Shyti --- .../devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml b/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml index ab151c9db2191..580003cdfff59 100644 --- a/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml +++ b/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml @@ -21,7 +21,7 @@ description: | google,cros-ec-spi or google,cros-ec-i2c. allOf: - - $ref: i2c-controller.yaml# + - $ref: /schemas/i2c/i2c-controller.yaml# properties: compatible: From 5a72477273066b5b357801ab2d315ef14949d402 Mon Sep 17 00:00:00 2001 From: Grygorii Tertychnyi Date: Mon, 20 May 2024 17:39:32 +0200 Subject: [PATCH 238/272] i2c: ocores: set IACK bit after core is enabled Setting IACK bit when core is disabled does not clear the "Interrupt Flag" bit in the status register, and the interrupt remains pending. Sometimes it causes failure for the very first message transfer, that is usually a device probe. Hence, set IACK bit after core is enabled to clear pending interrupt. 
Fixes: 18f98b1e3147 ("[PATCH] i2c: New bus driver for the OpenCores I2C controller") Signed-off-by: Grygorii Tertychnyi Acked-by: Peter Korsgaard Cc: stable@vger.kernel.org Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-ocores.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index 56a4dabf5a388..4ad670a80a63a 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -431,8 +431,8 @@ static int ocores_init(struct device *dev, struct ocores_i2c *i2c) oc_setreg(i2c, OCI2C_PREHIGH, prescale >> 8); /* Init the device */ - oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_IACK); oc_setreg(i2c, OCI2C_CONTROL, ctrl | OCI2C_CTRL_EN); + oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_IACK); return 0; } From 120dd4118e58dbda2ddb1dcf55f3c56cdfe8cee0 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Fri, 21 Jun 2024 10:18:40 +0800 Subject: [PATCH 239/272] LoongArch: Only allow OBJTOOL & ORC unwinder if toolchain supports -mthin-add-sub GAS <= 2.41 does not support generating R_LARCH_{32,64}_PCREL for "label - ." and it generates R_LARCH_{ADD,SUB}{32,64} pairs instead. Objtool cannot handle R_LARCH_{ADD,SUB}{32,64} pair in __jump_table (static key implementation) and etc. so it will produce some warnings. This is causing the kernel CI systems to complain everywhere. For GAS we can check if -mthin-add-sub option is available to know if R_LARCH_{32,64}_PCREL are supported. For Clang, we require Clang >= 18 and Clang >= 17 already supports R_LARCH_{32,64}_PCREL. But unfortunately Clang has some other issues, so we disable objtool for Clang at present. Note that __jump_table here is not generated by the compiler, so -fno-jump-table is completely irrelevant for this issue. 
Fixes: cb8a2ef0848c ("LoongArch: Add ORC stack unwinder support") Closes: https://lore.kernel.org/loongarch/Zl5m1ZlVmGKitAof@yujie-X299/ Closes: https://lore.kernel.org/loongarch/ZlY1gDDPi_mNrwJ1@slm.duckdns.org/ Closes: https://lore.kernel.org/loongarch/1717478006.038663-1-hengqi@linux.alibaba.com/ Link: https://sourceware.org/git/?p=binutils-gdb.git;a=commitdiff;h=816029e06768 Link: https://github.com/llvm/llvm-project/commit/42cb3c6346fc Signed-off-by: Xi Ruoyao Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 5 ++++- arch/loongarch/Kconfig.debug | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index e38139c576ee9..ddc042895d011 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -143,7 +143,7 @@ config LOONGARCH select HAVE_LIVEPATCH select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI - select HAVE_OBJTOOL if AS_HAS_EXPLICIT_RELOCS + select HAVE_OBJTOOL if AS_HAS_EXPLICIT_RELOCS && AS_HAS_THIN_ADD_SUB && !CC_IS_CLANG select HAVE_PCI select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@ -261,6 +261,9 @@ config AS_HAS_EXPLICIT_RELOCS config AS_HAS_FCSR_CLASS def_bool $(as-instr,movfcsr2gr \$t0$(comma)\$fcsr0) +config AS_HAS_THIN_ADD_SUB + def_bool $(cc-option,-Wa$(comma)-mthin-add-sub) + config AS_HAS_LSX_EXTENSION def_bool $(as-instr,vld \$vr0$(comma)\$a0$(comma)0) diff --git a/arch/loongarch/Kconfig.debug b/arch/loongarch/Kconfig.debug index 98d60630c3d4b..8b2ce5b5d43e8 100644 --- a/arch/loongarch/Kconfig.debug +++ b/arch/loongarch/Kconfig.debug @@ -28,6 +28,7 @@ config UNWINDER_PROLOGUE config UNWINDER_ORC bool "ORC unwinder" + depends on HAVE_OBJTOOL select OBJTOOL help This option enables the ORC (Oops Rewind Capability) unwinder for From f63a47b34b140ed1ca39d7e4bd4f1cdc617fc316 Mon Sep 17 00:00:00 2001 From: Hui Li Date: Fri, 21 Jun 2024 10:18:40 +0800 Subject: [PATCH 240/272] LoongArch: Fix watchpoint setting error In the current code, when debugging the following code using gdb,
"invalid argument ..." message will be displayed. lihui@bogon:~$ cat test.c #include <stdio.h> int a = 0; int main() { a = 1; return 0; } lihui@bogon:~$ gcc -g test.c -o test lihui@bogon:~$ gdb test ... (gdb) watch a Hardware watchpoint 1: a (gdb) r ... Invalid argument setting hardware debug registers There are mainly two types of issues. 1. Some incorrect judgment conditions existed in user_watch_state argument parsing, causing -EINVAL to be returned. When setting up a watchpoint, gdb uses the ptrace interface, ptrace(PTRACE_SETREGSET, tid, NT_LOONGARCH_HW_WATCH, (void *) &iov). Register values in user_watch_state are as follows: addr[0] = 0x0, mask[0] = 0x0, ctrl[0] = 0x0 addr[1] = 0x0, mask[1] = 0x0, ctrl[1] = 0x0 addr[2] = 0x0, mask[2] = 0x0, ctrl[2] = 0x0 addr[3] = 0x0, mask[3] = 0x0, ctrl[3] = 0x0 addr[4] = 0x0, mask[4] = 0x0, ctrl[4] = 0x0 addr[5] = 0x0, mask[5] = 0x0, ctrl[5] = 0x0 addr[6] = 0x0, mask[6] = 0x0, ctrl[6] = 0x0 addr[7] = 0x12000803c, mask[7] = 0x0, ctrl[7] = 0x610 arch_bp_generic_fields() returns -EINVAL when ctrl.len is LOONGARCH_BREAKPOINT_LEN_8 (0b00). So delete the incorrect judgment here. In ptrace_hbp_fill_attr_ctrl(), when note_type is NT_LOONGARCH_HW_WATCH and ctrl[0] == 0x0, if ((type & HW_BREAKPOINT_RW) != type) will return -EINVAL. Here ctrl.type should be set based on note_type, and unnecessary judgments can be removed. 2. The watchpoint argument was not set correctly due to unnecessary offset and alignment_mask. Modify ptrace_hbp_fill_attr_ctrl() and hw_breakpoint_arch_parse(), which ensures the watchpoint argument is set correctly.
All changes according to the LoongArch Reference Manual: https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#control-and-status-registers-related-to-watchpoints Cc: stable@vger.kernel.org Signed-off-by: Hui Li Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/hw_breakpoint.h | 2 +- arch/loongarch/kernel/hw_breakpoint.c | 19 ++++--------- arch/loongarch/kernel/ptrace.c | 32 ++++++++++------------ 3 files changed, 21 insertions(+), 32 deletions(-) diff --git a/arch/loongarch/include/asm/hw_breakpoint.h b/arch/loongarch/include/asm/hw_breakpoint.h index 21447fb1efc77..a8ce580f4fc6f 100644 --- a/arch/loongarch/include/asm/hw_breakpoint.h +++ b/arch/loongarch/include/asm/hw_breakpoint.h @@ -101,7 +101,7 @@ struct perf_event; struct perf_event_attr; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type, int *offset); + int *gen_len, int *gen_type); extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c index fc55c4de2a11f..950b2b8a82ee0 100644 --- a/arch/loongarch/kernel/hw_breakpoint.c +++ b/arch/loongarch/kernel/hw_breakpoint.c @@ -283,7 +283,7 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) * to generic breakpoint descriptions. 
*/ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type, int *offset) + int *gen_len, int *gen_type) { /* Type */ switch (ctrl.type) { @@ -303,11 +303,6 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, return -EINVAL; } - if (!ctrl.len) - return -EINVAL; - - *offset = __ffs(ctrl.len); - /* Len */ switch (ctrl.len) { case LOONGARCH_BREAKPOINT_LEN_1: @@ -386,21 +381,17 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, struct arch_hw_breakpoint *hw) { int ret; - u64 alignment_mask, offset; + u64 alignment_mask; /* Build the arch_hw_breakpoint. */ ret = arch_build_bp_info(bp, attr, hw); if (ret) return ret; - if (hw->ctrl.type != LOONGARCH_BREAKPOINT_EXECUTE) - alignment_mask = 0x7; - else + if (hw->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { alignment_mask = 0x3; - offset = hw->address & alignment_mask; - - hw->address &= ~alignment_mask; - hw->ctrl.len <<= offset; + hw->address &= ~alignment_mask; + } return 0; } diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c index c114c5ef13325..16b756c6049bc 100644 --- a/arch/loongarch/kernel/ptrace.c +++ b/arch/loongarch/kernel/ptrace.c @@ -494,28 +494,14 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, struct arch_hw_breakpoint_ctrl ctrl, struct perf_event_attr *attr) { - int err, len, type, offset; + int err, len, type; - err = arch_bp_generic_fields(ctrl, &len, &type, &offset); + err = arch_bp_generic_fields(ctrl, &len, &type); if (err) return err; - switch (note_type) { - case NT_LOONGARCH_HW_BREAK: - if ((type & HW_BREAKPOINT_X) != type) - return -EINVAL; - break; - case NT_LOONGARCH_HW_WATCH: - if ((type & HW_BREAKPOINT_RW) != type) - return -EINVAL; - break; - default: - return -EINVAL; - } - attr->bp_len = len; attr->bp_type = type; - attr->bp_addr += offset; return 0; } @@ -609,7 +595,19 @@ static int ptrace_hbp_set_ctrl(unsigned int note_type, return PTR_ERR(bp); attr = bp->attr; - decode_ctrl_reg(uctrl, &ctrl); 
+ + switch (note_type) { + case NT_LOONGARCH_HW_BREAK: + ctrl.type = LOONGARCH_BREAKPOINT_EXECUTE; + ctrl.len = LOONGARCH_BREAKPOINT_LEN_4; + break; + case NT_LOONGARCH_HW_WATCH: + decode_ctrl_reg(uctrl, &ctrl); + break; + default: + return -EINVAL; + } + err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr); if (err) return err; From c8e57ab0995c5b443d3c81c8a36b588776dcd0c3 Mon Sep 17 00:00:00 2001 From: Hui Li Date: Fri, 21 Jun 2024 10:18:40 +0800 Subject: [PATCH 241/272] LoongArch: Trigger user-space watchpoints correctly In the current code, gdb can set the watchpoint successfully through the ptrace interface, but the watchpoint will not be triggered when debugging the following code using gdb. lihui@bogon:~$ cat test.c #include <stdio.h> int a = 0; int main() { a = 1; printf("a = %d\n", a); return 0; } lihui@bogon:~$ gcc -g test.c -o test lihui@bogon:~$ gdb test ... (gdb) watch a ... (gdb) r ... a = 1 [Inferior 1 (process 4650) exited normally] No watchpoints were triggered. The root causes are: 1. The kernel uses the perf_event and hw_breakpoint frameworks to control watchpoints, but the perf_event corresponding to the watchpoint is not enabled. So it needs to be enabled according to the MWPnCFG3 or FWPnCFG3 PLV bit field in ptrace_hbp_set_ctrl(), and the privilege is set according to the monitored addr in hw_breakpoint_control(). Furthermore, add a judgment in ptrace_hbp_set_addr() to ensure a kernel-space addr cannot be monitored in user mode. 2. The global enable control for all watchpoints is the WE bit of CSR.CRMD, and hardware sets the value to 0 when an exception is triggered. When the ERTN instruction is executed to return, the hardware restores the value of the PWE field of CSR.PRMD here. So, before a thread containing watchpoints is scheduled, the PWE field of CSR.PRMD needs to be set to 1. Add this modification in hw_breakpoint_control().
All changes according to the LoongArch Reference Manual: https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#control-and-status-registers-related-to-watchpoints https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#basic-control-and-status-registers With this patch: lihui@bogon:~$ gdb test ... (gdb) watch a Hardware watchpoint 1: a (gdb) r ... Hardware watchpoint 1: a Old value = 0 New value = 1 main () at test.c:6 6 printf("a = %d\n", a); (gdb) c Continuing. a = 1 [Inferior 1 (process 775) exited normally] Cc: stable@vger.kernel.org Signed-off-by: Hui Li Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/hw_breakpoint.h | 2 ++ arch/loongarch/kernel/hw_breakpoint.c | 20 +++++++++++++++++--- arch/loongarch/kernel/ptrace.c | 15 ++++++++++++--- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/loongarch/include/asm/hw_breakpoint.h b/arch/loongarch/include/asm/hw_breakpoint.h index a8ce580f4fc6f..d78330916bd18 100644 --- a/arch/loongarch/include/asm/hw_breakpoint.h +++ b/arch/loongarch/include/asm/hw_breakpoint.h @@ -75,6 +75,8 @@ do { \ #define CSR_MWPC_NUM 0x3f #define CTRL_PLV_ENABLE 0x1e +#define CTRL_PLV0_ENABLE 0x02 +#define CTRL_PLV3_ENABLE 0x10 #define MWPnCFG3_LoadEn 8 #define MWPnCFG3_StoreEn 9 diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c index 950b2b8a82ee0..e882df1f72db8 100644 --- a/arch/loongarch/kernel/hw_breakpoint.c +++ b/arch/loongarch/kernel/hw_breakpoint.c @@ -174,11 +174,21 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) static int hw_breakpoint_control(struct perf_event *bp, enum hw_breakpoint_ops ops) { - u32 ctrl; + u32 ctrl, privilege; int i, max_slots, enable; + struct pt_regs *regs; struct perf_event **slots; struct arch_hw_breakpoint *info = counter_arch_bp(bp); + if (arch_check_bp_in_kernelspace(info)) + privilege = CTRL_PLV0_ENABLE; + else + privilege = CTRL_PLV3_ENABLE; + + /* Whether bp belongs to a task. 
*/ + if (bp->hw.target) + regs = task_pt_regs(bp->hw.target); + if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { /* Breakpoint */ slots = this_cpu_ptr(bp_on_reg); @@ -204,13 +214,15 @@ static int hw_breakpoint_control(struct perf_event *bp, write_wb_reg(CSR_CFG_ASID, i, 0, 0); write_wb_reg(CSR_CFG_ASID, i, 1, 0); if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { - write_wb_reg(CSR_CFG_CTRL, i, 0, CTRL_PLV_ENABLE); + write_wb_reg(CSR_CFG_CTRL, i, 0, privilege); } else { ctrl = encode_ctrl_reg(info->ctrl); - write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | CTRL_PLV_ENABLE); + write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | privilege); } enable = csr_read64(LOONGARCH_CSR_CRMD); csr_write64(CSR_CRMD_WE | enable, LOONGARCH_CSR_CRMD); + if (bp->hw.target) + regs->csr_prmd |= CSR_PRMD_PWE; break; case HW_BREAKPOINT_UNINSTALL: /* Reset the FWPnCFG/MWPnCFG 1~4 register. */ @@ -222,6 +234,8 @@ static int hw_breakpoint_control(struct perf_event *bp, write_wb_reg(CSR_CFG_CTRL, i, 1, 0); write_wb_reg(CSR_CFG_ASID, i, 0, 0); write_wb_reg(CSR_CFG_ASID, i, 1, 0); + if (bp->hw.target) + regs->csr_prmd &= ~CSR_PRMD_PWE; break; } diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c index 16b756c6049bc..200109de1971a 100644 --- a/arch/loongarch/kernel/ptrace.c +++ b/arch/loongarch/kernel/ptrace.c @@ -608,9 +608,14 @@ static int ptrace_hbp_set_ctrl(unsigned int note_type, return -EINVAL; } - err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr); - if (err) - return err; + if (uctrl & CTRL_PLV_ENABLE) { + err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr); + if (err) + return err; + attr.disabled = 0; + } else { + attr.disabled = 1; + } return modify_user_hw_breakpoint(bp, &attr); } @@ -641,6 +646,10 @@ static int ptrace_hbp_set_addr(unsigned int note_type, struct perf_event *bp; struct perf_event_attr attr; + /* Kernel-space address cannot be monitored by user-space */ + if ((unsigned long)addr >= XKPRANGE) + return -EINVAL; + bp = 
ptrace_hbp_get_initialised_bp(note_type, tsk, idx); if (IS_ERR(bp)) return PTR_ERR(bp); From 3eb2a8b23598e90fda43abb0f23cb267bd5018ba Mon Sep 17 00:00:00 2001 From: Hui Li Date: Fri, 21 Jun 2024 10:18:40 +0800 Subject: [PATCH 242/272] LoongArch: Fix multiple hardware watchpoint issues In the current code, if multiple hardware breakpoints/watchpoints are set in a user-space thread, some of them will not be triggered. This can be seen when debugging the following code using gdb. lihui@bogon:~$ cat test.c #include <stdio.h> int a = 0; int main() { printf("start test\n"); a = 1; printf("a = %d\n", a); printf("end test\n"); return 0; } lihui@bogon:~$ gcc -g test.c -o test lihui@bogon:~$ gdb test ... (gdb) start ... Temporary breakpoint 1, main () at test.c:5 5 printf("start test\n"); (gdb) watch a Hardware watchpoint 2: a (gdb) hbreak 8 Hardware assisted breakpoint 3 at 0x1200006ec: file test.c, line 8. (gdb) c Continuing. start test a = 1 Breakpoint 3, main () at test.c:8 8 printf("end test\n"); ... The first hardware watchpoint is not triggered, the root causes are: 1. In hw_breakpoint_control(), the FWPnCFG1.2.4/MWPnCFG1.2.4 register settings are not distinguished. They should be set based on hardware watchpoint functions (fetch or load/store operations). 2. In breakpoint_handler() and watchpoint_handler(), it doesn't identify which watchpoint is triggered. So, all watchpoint-related perf_event callbacks are called and siginfo is sent to the user space. This leaves user-space unable to determine which watchpoint is triggered. The kernel needs to identify which watchpoint is triggered via MWPS/FWPS registers, and then call the corresponding perf event callbacks to report siginfo to the user-space. Modify the relevant code to solve the above issues. All changes according to the LoongArch Reference Manual: https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#control-and-status-registers-related-to-watchpoints With this patch: lihui@bogon:~$ gdb test ... (gdb) start ... 
Temporary breakpoint 1, main () at test.c:5 5 printf("start test\n"); (gdb) watch a Hardware watchpoint 2: a (gdb) hbreak 8 Hardware assisted breakpoint 3 at 0x1200006ec: file test.c, line 8. (gdb) c Continuing. start test Hardware watchpoint 2: a Old value = 0 New value = 1 main () at test.c:7 7 printf("a = %d\n", a); (gdb) c Continuing. a = 1 Breakpoint 3, main () at test.c:8 8 printf("end test\n"); (gdb) c Continuing. end test [Inferior 1 (process 778) exited normally] Cc: stable@vger.kernel.org Signed-off-by: Hui Li Signed-off-by: Huacai Chen --- arch/loongarch/kernel/hw_breakpoint.c | 57 ++++++++++++++++----------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c index e882df1f72db8..621ad7634df71 100644 --- a/arch/loongarch/kernel/hw_breakpoint.c +++ b/arch/loongarch/kernel/hw_breakpoint.c @@ -207,15 +207,15 @@ static int hw_breakpoint_control(struct perf_event *bp, switch (ops) { case HW_BREAKPOINT_INSTALL: /* Set the FWPnCFG/MWPnCFG 1~4 register. */ - write_wb_reg(CSR_CFG_ADDR, i, 0, info->address); - write_wb_reg(CSR_CFG_ADDR, i, 1, info->address); - write_wb_reg(CSR_CFG_MASK, i, 0, info->mask); - write_wb_reg(CSR_CFG_MASK, i, 1, info->mask); - write_wb_reg(CSR_CFG_ASID, i, 0, 0); - write_wb_reg(CSR_CFG_ASID, i, 1, 0); if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { + write_wb_reg(CSR_CFG_ADDR, i, 0, info->address); + write_wb_reg(CSR_CFG_MASK, i, 0, info->mask); + write_wb_reg(CSR_CFG_ASID, i, 0, 0); write_wb_reg(CSR_CFG_CTRL, i, 0, privilege); } else { + write_wb_reg(CSR_CFG_ADDR, i, 1, info->address); + write_wb_reg(CSR_CFG_MASK, i, 1, info->mask); + write_wb_reg(CSR_CFG_ASID, i, 1, 0); ctrl = encode_ctrl_reg(info->ctrl); write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | privilege); } @@ -226,14 +226,17 @@ static int hw_breakpoint_control(struct perf_event *bp, break; case HW_BREAKPOINT_UNINSTALL: /* Reset the FWPnCFG/MWPnCFG 1~4 register. 
*/ - write_wb_reg(CSR_CFG_ADDR, i, 0, 0); - write_wb_reg(CSR_CFG_ADDR, i, 1, 0); - write_wb_reg(CSR_CFG_MASK, i, 0, 0); - write_wb_reg(CSR_CFG_MASK, i, 1, 0); - write_wb_reg(CSR_CFG_CTRL, i, 0, 0); - write_wb_reg(CSR_CFG_CTRL, i, 1, 0); - write_wb_reg(CSR_CFG_ASID, i, 0, 0); - write_wb_reg(CSR_CFG_ASID, i, 1, 0); + if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { + write_wb_reg(CSR_CFG_ADDR, i, 0, 0); + write_wb_reg(CSR_CFG_MASK, i, 0, 0); + write_wb_reg(CSR_CFG_CTRL, i, 0, 0); + write_wb_reg(CSR_CFG_ASID, i, 0, 0); + } else { + write_wb_reg(CSR_CFG_ADDR, i, 1, 0); + write_wb_reg(CSR_CFG_MASK, i, 1, 0); + write_wb_reg(CSR_CFG_CTRL, i, 1, 0); + write_wb_reg(CSR_CFG_ASID, i, 1, 0); + } if (bp->hw.target) regs->csr_prmd &= ~CSR_PRMD_PWE; break; @@ -476,12 +479,15 @@ void breakpoint_handler(struct pt_regs *regs) slots = this_cpu_ptr(bp_on_reg); for (i = 0; i < boot_cpu_data.watch_ireg_count; ++i) { - bp = slots[i]; - if (bp == NULL) - continue; - perf_bp_event(bp, regs); + if ((csr_read32(LOONGARCH_CSR_FWPS) & (0x1 << i))) { + bp = slots[i]; + if (bp == NULL) + continue; + perf_bp_event(bp, regs); + csr_write32(0x1 << i, LOONGARCH_CSR_FWPS); + update_bp_registers(regs, 0, 0); + } } - update_bp_registers(regs, 0, 0); } NOKPROBE_SYMBOL(breakpoint_handler); @@ -493,12 +499,15 @@ void watchpoint_handler(struct pt_regs *regs) slots = this_cpu_ptr(wp_on_reg); for (i = 0; i < boot_cpu_data.watch_dreg_count; ++i) { - wp = slots[i]; - if (wp == NULL) - continue; - perf_bp_event(wp, regs); + if ((csr_read32(LOONGARCH_CSR_MWPS) & (0x1 << i))) { + wp = slots[i]; + if (wp == NULL) + continue; + perf_bp_event(wp, regs); + csr_write32(0x1 << i, LOONGARCH_CSR_MWPS); + update_bp_registers(regs, 0, 1); + } } - update_bp_registers(regs, 0, 1); } NOKPROBE_SYMBOL(watchpoint_handler); From d0a1c07739e1b7f74683fe061545669156d102f2 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 21 Jun 2024 10:18:40 +0800 Subject: [PATCH 243/272] LoongArch: KVM: Remove an unneeded semicolon Remove an 
unneeded semicolon to avoid build warnings: ./arch/loongarch/kvm/exit.c:764:2-3: Unneeded semicolon Cc: stable@vger.kernel.org Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=9343 Signed-off-by: Yang Li Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index c86e099af5cad..a68573e091c01 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -761,7 +761,7 @@ static void kvm_handle_service(struct kvm_vcpu *vcpu) default: ret = KVM_HCALL_INVALID_CODE; break; - }; + } kvm_write_reg(vcpu, LOONGARCH_GPR_A0, ret); } From ad22051afdad962b6012f3823d0ed1a735935386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ca=C3=B1o?= Date: Thu, 20 Jun 2024 17:25:33 +0200 Subject: [PATCH 244/272] ALSA: hda/realtek: Add quirk for Lenovo Yoga Pro 7 14AHP9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lenovo Yoga Pro 7 14AHP9 (PCI SSID 17aa:3891) seems to require a workaround for the bass speaker similar to the one used for the Yoga 9 model and the Yoga Pro 7 14APH8. 
Cc: Link: https://lore.kernel.org/all/20231207182035.30248-1-tiwai@suse.de/ Signed-off-by: Pablo Caño Link: https://patch.msgid.link/20240620152533.76712-1-pablocpascual@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e2dbcf8f5bcfb..f4454abadc8d9 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10555,6 +10555,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3882, "Lenovo Yoga Pro 7 14APH8", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x3884, "Y780 YG DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3886, "Y780 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x17aa, 0x3891, "Lenovo Yoga Pro 7 14AHP9", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x38a7, "Y780P AMD YG dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38a8, "Y780P AMD VECO dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38a9, "Thinkbook 16P", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), From 4a3e37b3caea817199757a0b13aa53dd7c9376c8 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Sun, 16 Jun 2024 14:25:02 +0100 Subject: [PATCH 245/272] MIPS: mipsmtregs: Fix target register for MFTC0 Target register of mftc0 should be __res instead of $1, this is a leftover from old .insn code. 
Fixes: dd6d29a61489 ("MIPS: Implement microMIPS MT ASE helpers") Cc: stable@vger.kernel.org Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/mipsmtregs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/include/asm/mipsmtregs.h b/arch/mips/include/asm/mipsmtregs.h index 30e86861c206c..b1ee3c48e84ba 100644 --- a/arch/mips/include/asm/mipsmtregs.h +++ b/arch/mips/include/asm/mipsmtregs.h @@ -322,7 +322,7 @@ static inline void ehb(void) " .set push \n" \ " .set "MIPS_ISA_LEVEL" \n" \ _ASM_SET_MFTC0 \ - " mftc0 $1, " #rt ", " #sel " \n" \ + " mftc0 %0, " #rt ", " #sel " \n" \ _ASM_UNSET_MFTC0 \ " .set pop \n" \ : "=r" (__res)); \ From 0d5679a0aae2d8cda72169452c32e5cb88a7ab33 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Jun 2024 18:23:04 +0200 Subject: [PATCH 246/272] mips: fix compat_sys_lseek syscall This is almost compatible, but passing a negative offset should result in a EINVAL error, but on mips o32 compat mode would seek to a large 32-bit byte offset. Use compat_sys_lseek() to correctly sign-extend the argument. 
Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/syscalls/syscall_o32.tbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 008ebe60263e3..81428a2eb6604 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -27,7 +27,7 @@ 17 o32 break sys_ni_syscall # 18 was sys_stat 18 o32 unused18 sys_ni_syscall -19 o32 lseek sys_lseek +19 o32 lseek sys_lseek compat_sys_lseek 20 o32 getpid sys_getpid 21 o32 mount sys_mount 22 o32 umount sys_oldumount From 17563b4a19d1844bdbccc7a82d2f31c28ca9cfae Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 21 Jun 2024 09:39:09 +0200 Subject: [PATCH 247/272] ALSA: hda: Use imply for suggesting CONFIG_SERIAL_MULTI_INSTANTIATE The recent fix introduced a reverse selection of CONFIG_SERIAL_MULTI_INSTANTIATE, but its condition isn't always met. Use a weak reverse selection to suggest the config for avoiding such inconsistencies, instead. 
Fixes: 9b1effff19cd ("ALSA: hda: cs35l56: Select SERIAL_MULTI_INSTANTIATE") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202406210732.ozgk8IMK-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202406211244.oLhoF3My-lkp@intel.com/ Reviewed-by: Richard Fitzgerald Link: https://patch.msgid.link/20240621073915.19576-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/Kconfig b/sound/pci/hda/Kconfig index e59df40a0007a..a3cf0725fc43b 100644 --- a/sound/pci/hda/Kconfig +++ b/sound/pci/hda/Kconfig @@ -162,7 +162,7 @@ config SND_HDA_SCODEC_CS35L56_I2C depends on ACPI || COMPILE_TEST depends on SND_SOC select FW_CS_DSP - select SERIAL_MULTI_INSTANTIATE + imply SERIAL_MULTI_INSTANTIATE select SND_HDA_GENERIC select SND_SOC_CS35L56_SHARED select SND_HDA_SCODEC_CS35L56 @@ -179,7 +179,7 @@ config SND_HDA_SCODEC_CS35L56_SPI depends on ACPI || COMPILE_TEST depends on SND_SOC select FW_CS_DSP - select SERIAL_MULTI_INSTANTIATE + imply SERIAL_MULTI_INSTANTIATE select SND_HDA_GENERIC select SND_SOC_CS35L56_SHARED select SND_HDA_SCODEC_CS35L56 From cf6d9d2d243f242f51ee0666ca88e61d9408752f Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Tue, 4 Jun 2024 18:35:10 -0500 Subject: [PATCH 248/272] KVM: SEV-ES: Fix svm_get_msr()/svm_set_msr() for KVM_SEV_ES_INIT guests With commit 27bd5fdc24c0 ("KVM: SEV-ES: Prevent MSR access post VMSA encryption"), older VMMs like QEMU 9.0 and older will fail when booting SEV-ES guests with something like the following error: qemu-system-x86_64: error: failed to get MSR 0x174 qemu-system-x86_64: ../qemu.git/target/i386/kvm/kvm.c:3950: kvm_get_msrs: Assertion `ret == cpu->kvm_msr_buf->nmsrs' failed. 
This is because older VMMs might still call svm_get_msr()/svm_set_msr() for SEV-ES guests after guest boot, even if those interfaces were essentially just noops because of the vCPU state being encrypted and stored separately in the VMSA. Now those VMMs will get an -EINVAL and generally crash. Newer VMMs that are aware of KVM_SEV_INIT2 however are already aware of the stricter limitations of what vCPU state can be sync'd during guest run-time, so newer QEMU for instance will work with both the legacy KVM_SEV_ES_INIT interface and KVM_SEV_INIT2. So when using KVM_SEV_INIT2 it's okay to assume userspace can deal with -EINVAL, whereas for legacy KVM_SEV_ES_INIT the kernel might be dealing with an older VMM, and so it needs to assume that returning -EINVAL might break the VMM. Address this by only returning -EINVAL if the guest was started with KVM_SEV_INIT2. Otherwise, just silently return. Cc: Ravi Bangoria Cc: Nikunj A Dadhania Reported-by: Srikanth Aithal Closes: https://lore.kernel.org/lkml/37usuu4yu4ok7be2hqexhmcyopluuiqj3k266z4gajc2rcj4yo@eujb23qc3zcm/ Fixes: 27bd5fdc24c0 ("KVM: SEV-ES: Prevent MSR access post VMSA encryption") Signed-off-by: Michael Roth Message-ID: <20240604233510.764949-1-michael.roth@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 296c524988f95..c95d3900fe564 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2843,7 +2843,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (sev_es_prevent_msr_access(vcpu, msr_info)) { msr_info->data = 0; - return -EINVAL; + return vcpu->kvm->arch.has_protected_state ? 
-EINVAL : 0; } switch (msr_info->index) { @@ -2998,7 +2998,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u64 data = msr->data; if (sev_es_prevent_msr_access(vcpu, msr)) - return -EINVAL; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; switch (ecx) { case MSR_AMD64_TSC_RATIO: From 1cbf347288702af0fe8667c0ce760afbe982a2f1 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 14 Jun 2024 11:14:18 +0300 Subject: [PATCH 249/272] i2c: Add nop fwnode operations Add nop variants of i2c_find_device_by_fwnode(), i2c_find_adapter_by_fwnode() and i2c_get_adapter_by_fwnode() for use without CONFIG_I2C. Signed-off-by: Sakari Ailus Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 9709537370ee9..424acb98c7c26 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -960,8 +960,6 @@ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr); #define builtin_i2c_driver(__i2c_driver) \ builtin_driver(__i2c_driver, i2c_add_driver) -#endif /* I2C */ - /* must call put_device() when done with returned i2c_client device */ struct i2c_client *i2c_find_device_by_fwnode(struct fwnode_handle *fwnode); @@ -971,6 +969,28 @@ struct i2c_adapter *i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode); /* must call i2c_put_adapter() when done with returned i2c_adapter device */ struct i2c_adapter *i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode); +#else /* I2C */ + +static inline struct i2c_client * +i2c_find_device_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +static inline struct i2c_adapter * +i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +static inline struct i2c_adapter * +i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +#endif /* !I2C */ + #if IS_ENABLED(CONFIG_OF) /* must call put_device() when 
done with returned i2c_client device */ static inline struct i2c_client *of_find_i2c_device_by_node(struct device_node *node) From c1eb2512596fb3542357bb6c34c286f5e0374538 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 May 2024 15:52:52 +0300 Subject: [PATCH 250/272] RDMA/mlx5: Remove extra unlock on error path The below commit lifted the locking out of this function but left this error path unlock behind resulting in unbalanced locking. Remove the missed unlock too. Cc: stable@vger.kernel.org Fixes: 627122280c87 ("RDMA/mlx5: Add work to remove temporary entries from the cache") Signed-off-by: Jason Gunthorpe Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/78090c210c750f47219b95248f9f782f34548bb1.1716900410.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index ecc111ed5d86e..38d2c743db877 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -641,10 +641,8 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, new = &((*new)->rb_left); if (cmp < 0) new = &((*new)->rb_right); - if (cmp == 0) { - mutex_unlock(&cache->rb_lock); + if (cmp == 0) return -EEXIST; - } } /* Add new node and rebalance tree. */ From f637040c3339a2ed8c12d65ad03f9552386e2fe7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 May 2024 15:52:53 +0300 Subject: [PATCH 251/272] RDMA/mlx5: Follow rb_key.ats when creating new mkeys When a cache ent already exists but doesn't have any mkeys in it the cache will automatically create a new one based on the specification in the ent->rb_key. ent->ats was missed when creating the new key and so ma_translation_mode was not being set even though the ent requires it. 
Cc: stable@vger.kernel.org Fixes: 73d09b2fe833 ("RDMA/mlx5: Introduce mlx5r_cache_rb_key") Signed-off-by: Jason Gunthorpe Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/7c5613458ecb89fbe5606b7aa4c8d990bdea5b9a.1716900410.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 38d2c743db877..35dcb9d9e12af 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -246,6 +246,7 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3); MLX5_SET(mkc, mkc, access_mode_4_2, (ent->rb_key.access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats); MLX5_SET(mkc, mkc, translations_octword_size, get_mkc_octo_size(ent->rb_key.access_mode, From 2e4c02fdecf2f6f55cefe48cb82d93fa4f8e2204 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 May 2024 15:52:54 +0300 Subject: [PATCH 252/272] RDMA/mlx5: Ensure created mkeys always have a populated rb_key cachable and mmkey.rb_key together are used by mlx5_revoke_mr() to put the MR/mkey back into the cache. In all cases they should be set correctly. alloc_cacheable_mr() was setting cachable but not filling rb_key, resulting in cache_ent_find_and_store() bucketing them all into a 0 length entry. implicit_get_child_mr()/mlx5_ib_alloc_implicit_mr() failed to set cachable or rb_key at all, so the cache was not working at all for implicit ODP. 
Cc: stable@vger.kernel.org Fixes: 8c1185fef68c ("RDMA/mlx5: Change check for cacheable mkeys") Fixes: dd1b913fb0d0 ("RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow") Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7778c02dfa0999a30d6746c79a23dd7140a9c729.1716900410.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 35dcb9d9e12af..d3c1f63791a2b 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -718,6 +718,8 @@ static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, } mr->mmkey.cache_ent = ent; mr->mmkey.type = MLX5_MKEY_MR; + mr->mmkey.rb_key = ent->rb_key; + mr->mmkey.cacheable = true; init_waitqueue_head(&mr->mmkey.wait); return mr; } @@ -1168,7 +1170,6 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, mr->ibmr.pd = pd; mr->umem = umem; mr->page_shift = order_base_2(page_size); - mr->mmkey.cacheable = true; set_mr_fields(dev, mr, umem->length, access_flags, iova); return mr; From 81497c148b7a2e4a4fbda93aee585439f7323e2e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 28 May 2024 15:52:55 +0300 Subject: [PATCH 253/272] RDMA/mlx5: Fix unwind flow as part of mlx5_ib_stage_init_init Fix unwind flow as part of mlx5_ib_stage_init_init to use the correct goto upon an error. 
Fixes: 758ce14aee82 ("RDMA/mlx5: Implement MACsec gid addition and deletion") Signed-off-by: Yishai Hadas Reviewed-by: Patrisious Haddad Link: https://lore.kernel.org/r/aa40615116eda14ec9eca21d52017d632ea89188.1716900410.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 2366c46eebc87..43660c831b22c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3759,10 +3759,10 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; return 0; -err: - mlx5r_macsec_dealloc_gids(dev); err_mp: mlx5_ib_cleanup_multiport_master(dev); +err: + mlx5r_macsec_dealloc_gids(dev); return err; } From 36ab7ada64caf08f10ee5a114d39964d1f91e81d Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Tue, 28 May 2024 15:52:56 +0300 Subject: [PATCH 254/272] RDMA/mlx5: Add check for srq max_sge attribute max_sge attribute is passed by the user, and is inserted and used unchecked, so verify that the value doesn't exceed maximum allowed value before using it. 
Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Patrisious Haddad Link: https://lore.kernel.org/r/277ccc29e8d57bfd53ddeb2ac633f2760cf8cdd0.1716900410.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/srq.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index a056ea835da54..84be0c3d56995 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -199,17 +199,20 @@ int mlx5_ib_create_srq(struct ib_srq *ib_srq, int err; struct mlx5_srq_attr in = {}; __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + __u32 max_sge_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq) / + sizeof(struct mlx5_wqe_data_seg); if (init_attr->srq_type != IB_SRQT_BASIC && init_attr->srq_type != IB_SRQT_XRC && init_attr->srq_type != IB_SRQT_TM) return -EOPNOTSUPP; - /* Sanity check SRQ size before proceeding */ - if (init_attr->attr.max_wr >= max_srq_wqes) { - mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", - init_attr->attr.max_wr, - max_srq_wqes); + /* Sanity check SRQ and sge size before proceeding */ + if (init_attr->attr.max_wr >= max_srq_wqes || + init_attr->attr.max_sge > max_sge_sz) { + mlx5_ib_dbg(dev, "max_wr %d,wr_cap %d,max_sge %d, sge_cap:%d\n", + init_attr->attr.max_wr, max_srq_wqes, + init_attr->attr.max_sge, max_sge_sz); return -EINVAL; } From 82a5cc783d49b86afd2f60e297ecd85223c39f88 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 5 Jun 2024 01:16:08 -0700 Subject: [PATCH 255/272] RDMA/mana_ib: Ignore optional access flags for MRs Ignore optional ib_access_flags when an MR is created. 
Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1717575368-14879-1-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/mr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index 4f13423ecdbdf..887b09dd86e78 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -112,6 +112,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, "start 0x%llx, iova 0x%llx length 0x%llx access_flags 0x%x", start, iova, length, access_flags); + access_flags &= ~IB_ACCESS_OPTIONAL; if (access_flags & ~VALID_MR_FLAGS) return ERR_PTR(-EINVAL); From 339b84ab6b1d66900c27bd999271cb2ae40ce812 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jun 2024 09:45:09 -0400 Subject: [PATCH 256/272] closures: Change BUG_ON() to WARN_ON() If a BUG_ON() can be hit in the wild, it shouldn't be a BUG_ON() For reference, this has popped up once in the CI, and we'll need more info to debug it: 03240 ------------[ cut here ]------------ 03240 kernel BUG at lib/closure.c:21! 03240 kernel BUG at lib/closure.c:21! 
03240 Internal error: Oops - BUG: 00000000f2000800 [#1] SMP 03240 Modules linked in: 03240 CPU: 15 PID: 40534 Comm: kworker/u80:1 Not tainted 6.10.0-rc4-ktest-ga56da69799bd #25570 03240 Hardware name: linux,dummy-virt (DT) 03240 Workqueue: btree_update btree_interior_update_work 03240 pstate: 00001005 (nzcv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 03240 pc : closure_put+0x224/0x2a0 03240 lr : closure_put+0x24/0x2a0 03240 sp : ffff0000d12071c0 03240 x29: ffff0000d12071c0 x28: dfff800000000000 x27: ffff0000d1207360 03240 x26: 0000000000000040 x25: 0000000000000040 x24: 0000000000000040 03240 x23: ffff0000c1f20180 x22: 0000000000000000 x21: ffff0000c1f20168 03240 x20: 0000000040000000 x19: ffff0000c1f20140 x18: 0000000000000001 03240 x17: 0000000000003aa0 x16: 0000000000003ad0 x15: 1fffe0001c326974 03240 x14: 0000000000000a1e x13: 0000000000000000 x12: 1fffe000183e402d 03240 x11: ffff6000183e402d x10: dfff800000000000 x9 : ffff6000183e402e 03240 x8 : 0000000000000001 x7 : 00009fffe7c1bfd3 x6 : ffff0000c1f2016b 03240 x5 : ffff0000c1f20168 x4 : ffff6000183e402e x3 : ffff800081391954 03240 x2 : 0000000000000001 x1 : 0000000000000000 x0 : 00000000a8000000 03240 Call trace: 03240 closure_put+0x224/0x2a0 03240 bch2_check_for_deadlock+0x910/0x1028 03240 bch2_six_check_for_deadlock+0x1c/0x30 03240 six_lock_slowpath.isra.0+0x29c/0xed0 03240 six_lock_ip_waiter+0xa8/0xf8 03240 __bch2_btree_node_lock_write+0x14c/0x298 03240 bch2_trans_lock_write+0x6d4/0xb10 03240 __bch2_trans_commit+0x135c/0x5520 03240 btree_interior_update_work+0x1248/0x1c10 03240 process_scheduled_works+0x53c/0xd90 03240 worker_thread+0x370/0x8c8 03240 kthread+0x258/0x2e8 03240 ret_from_fork+0x10/0x20 03240 Code: aa1303e0 d63f0020 a94363f7 17ffff8c (d4210000) 03240 ---[ end trace 0000000000000000 ]--- 03240 Kernel panic - not syncing: Oops - BUG: Fatal exception 03240 SMP: stopping secondary CPUs 03241 SMP: failed to stop secondary CPUs 13,15 03241 Kernel Offset: disabled 03241 CPU features: 
0x00,00000003,80000008,4240500b 03241 Memory Limit: none 03241 ---[ end Kernel panic - not syncing: Oops - BUG: Fatal exception ]--- 03246 ========= FAILED TIMEOUT copygc_torture_no_checksum in 7200s Signed-off-by: Kent Overstreet --- lib/closure.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/closure.c b/lib/closure.c index 07409e9e35a53..2e1ee9fdec081 100644 --- a/lib/closure.c +++ b/lib/closure.c @@ -17,12 +17,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; - BUG_ON(flags & CLOSURE_GUARD_MASK); - BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); + if (WARN(flags & CLOSURE_GUARD_MASK, + "closure has guard bits set: %x (%u)", + flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r))) + r &= ~CLOSURE_GUARD_MASK; if (!r) { smp_acquire__after_ctrl_dep(); + WARN(flags & ~CLOSURE_DESTRUCTOR, + "closure ref hit 0 with incorrect flags set: %x (%u)", + flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags)); + cl->closure_get_happened = false; if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { From f648b6c12b70af9d24a293617102729cee6b7862 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jun 2024 10:04:35 -0400 Subject: [PATCH 257/272] bcachefs: Fix missing alloc_data_type_set() Incorrect bucket state transition in the discard path; when incrementing a bucket's generation number that had already been discarded, we were forgetting to check if it should be need_gc_gens, not free. This was caught by the .invalid checks in the transaction commit path, causing us to go emergency read only. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 7b5909764d148..e5e7d33f4a5ef 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -776,6 +776,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + alloc_data_type_set(new_a, new_a->data_type); } if (old_a->data_type != new_a->data_type || @@ -1796,8 +1797,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - alloc_data_type_set(&a->v, a->v.data_type); write: + alloc_data_type_set(&a->v, a->v.data_type); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| From 504794067fc266be5ac170777a94a927a72ac846 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 May 2024 22:52:22 -0400 Subject: [PATCH 258/272] bcachefs: Replace bare EEXIST with private error codes Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/errcode.h | 3 +++ fs/bcachefs/fs-ioctl.c | 2 +- fs/bcachefs/str_hash.h | 2 +- fs/bcachefs/super.c | 11 ++++++----- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e5e7d33f4a5ef..8dec2c6cbb7eb 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1643,7 +1643,7 @@ static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) mutex_lock(&c->discard_buckets_in_flight_lock); darray_for_each(c->discard_buckets_in_flight, i) if (bkey_eq(*i, bucket)) { - ret = -EEXIST; + ret = -BCH_ERR_EEXIST_discard_in_flight_add; goto out; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index dbe35b80bc0b8..58612abf7927a 100644 --- 
a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -116,6 +116,9 @@ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ + x(EEXIST, EEXIST_str_hash_set) \ + x(EEXIST, EEXIST_discard_in_flight_add) \ + x(EEXIST, EEXIST_subvolume_create) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 3551a737181b2..79a0c8732bced 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -373,7 +373,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, } if (dst_dentry->d_inode) { - error = -EEXIST; + error = -BCH_ERR_EEXIST_subvolume_create; goto err3; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index cbad9b27874fe..c8c266cb57972 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -300,7 +300,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, if (!found && (flags & STR_HASH_must_replace)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; } else if (found && (flags & STR_HASH_must_create)) { - ret = -EEXIST; + ret = -BCH_ERR_EEXIST_str_hash_set; } else { if (!found && slot.path) swap(iter, slot); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 635da5b3439cf..9083df82073a5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -931,12 +931,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - for (i = 0; i < c->sb.nr_devices; i++) - if (bch2_member_exists(c->disk_sb.sb, i) && - bch2_dev_alloc(c, i)) { - ret = -EEXIST; + for (i = 0; i < c->sb.nr_devices; i++) { + if (!bch2_member_exists(c->disk_sb.sb, i)) + continue; + ret = bch2_dev_alloc(c, i); + if (ret) goto err; - } + } bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, From dd9086487c1bb38641bcfbe765422c7f0a1a8d95 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 
20 Jun 2024 13:20:49 -0400 Subject: [PATCH 259/272] bcachefs: Fix I_NEW warning in race path in bch2_inode_insert() discard_new_inode() is the correct interface for tearing down an inode that was fully created but not made visible to other threads, but it expects I_NEW to be set, which we don't use. Reported-by: https://github.com/koverstreet/bcachefs/issues/690 Fixes: bcachefs: Fix race path in bch2_inode_insert() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 8314d3e1582d3..615ef8305c6eb 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -188,6 +188,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino BUG_ON(!old); if (unlikely(old != inode)) { + /* + * bcachefs doesn't use I_NEW; we have no use for it since we + * only insert fully created inodes in the inode hash table. But + * discard_new_inode() expects it to be set... + */ + inode->v.i_flags |= I_NEW; discard_new_inode(&inode->v); inode = old; } else { @@ -195,8 +201,10 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); mutex_unlock(&c->vfs_inodes_lock); /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... + * Again, I_NEW makes no sense for bcachefs. This is only needed + * for clearing I_NEW, but since the inode was already fully + * created and initialized we didn't actually want + * inode_insert5() to set it for us. */ unlock_new_inode(&inode->v); } From e6b3a655ac7ba5282b1504851488236865804cb8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jun 2024 13:10:34 -0400 Subject: [PATCH 260/272] bcachefs: Use bch2_print_string_as_lines for long err printk strings get truncated to 1024 bytes; if we have a long error message (journal debug info) we need to use a helper. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index cdcb1ad49af42..492426c8d869a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1967,7 +1967,6 @@ CLOSURE_CALLBACK(bch2_journal_write) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; - struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; int ret; @@ -2011,11 +2010,15 @@ CLOSURE_CALLBACK(bch2_journal_write) } if (ret) { - __bch2_journal_debug_to_text(&journal_debug_buf, j); + struct printbuf buf = PRINTBUF; + buf.atomic++; + + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write: %s"), + bch2_err_str(ret)); + __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); - bch_err(c, "Unable to allocate journal write:\n%s", - journal_debug_buf.buf); - printbuf_exit(&journal_debug_buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); goto err; } From 2fe79ce7d1e8ec5059e7dfc15f3c769ae9679569 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Jun 2024 19:42:39 -0400 Subject: [PATCH 261/272] bcachefs: Fix a UAF after write_super() write_super() may reallocate the superblock buffer - but bch_sb_field_ext was referencing it; don't use it after the write_super call. 
Reported-by: syzbot+8992fc10a192067b8d8a@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e632da69196cc..1f9d044ed9207 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -664,10 +664,10 @@ int bch2_fs_recovery(struct bch_fs *c) if (check_version_upgrade(c)) write_sb = true; + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (write_sb) bch2_write_super(c); - - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); mutex_unlock(&c->sb_lock); if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) From bd4da0462ea7bf26b2a5df5528ec20c550f7ec41 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Tue, 4 Jun 2024 16:46:10 +0800 Subject: [PATCH 262/272] bcachefs: Move the ei_flags setting to after initialization `inode->ei_flags` setting and cleaning should be done after initialization, otherwise the operation is invalid. 
Fixes: 9ca4853b98af ("bcachefs: Fix quota support for snapshots") Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 615ef8305c6eb..f9c9a95d7d4ca 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1497,11 +1497,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, bch2_iget5_set(&inode->v, &inum); bch2_inode_update_after_write(trans, inode, bi, ~0); - if (BCH_SUBVOLUME_SNAP(subvol)) - set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - else - clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; @@ -1513,6 +1508,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->ei_qid = bch_qid(bi); inode->ei_subvol = inum.subvol; + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + inode->v.i_mapping->a_ops = &bch_address_space_operations; switch (inode->v.i_mode & S_IFMT) { From c45fcf46ca2368dafe7e5c513a711a6f0f974308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 21 Jun 2024 16:37:12 +0200 Subject: [PATCH 263/272] pwm: stm32: Refuse too small period requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If period_ns is small, prd might well become 0. Catch that case because otherwise with regmap_write(priv->regmap, TIM_ARR, prd - 1); a few lines down quite a big period is configured. 
Fixes: 7edf7369205b ("pwm: Add driver for STM32 plaftorm") Cc: stable@vger.kernel.org Reviewed-by: Trevor Gamblin Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/b86f62f099983646f97eeb6bfc0117bb2d0c340d.1718979150.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index a2f231d13a9f7..3e7b2a8e34e7d 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -337,6 +337,8 @@ static int stm32_pwm_config(struct stm32_pwm *priv, unsigned int ch, prd = mul_u64_u64_div_u64(period_ns, clk_get_rate(priv->clk), (u64)NSEC_PER_SEC * (prescaler + 1)); + if (!prd) + return -EINVAL; /* * All channels share the same prescaler and counter so when two From d18b822c8f622ed37af7130088a0b7f1eb0b16e6 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 21 Jun 2024 09:30:08 +0200 Subject: [PATCH 264/272] docs: i2c: summary: start sentences consistently. Change the first paragraphs to contain only one space after the end of the previous sentence like in the rest of the document. Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/summary.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index 786c618ba3bef..28ff80a2302be 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -4,10 +4,10 @@ Introduction to I2C and SMBus I²C (pronounce: I squared C and written I2C in the kernel documentation) is a protocol developed by Philips. It is a slow two-wire protocol (variable -speed, up to 400 kHz), with a high speed extension (3.4 MHz). It provides +speed, up to 400 kHz), with a high speed extension (3.4 MHz). It provides an inexpensive bus for connecting many types of devices with infrequent or -low bandwidth communications needs. 
I2C is widely used with embedded -systems. Some systems use variants that don't meet branding requirements, +low bandwidth communications needs. I2C is widely used with embedded +systems. Some systems use variants that don't meet branding requirements, and so are not advertised as being I2C but come under different names, e.g. TWI (Two Wire Interface), IIC. @@ -18,14 +18,14 @@ access the PDF. An older version of the specification (revision 6) is archived `here `_. SMBus (System Management Bus) is based on the I2C protocol, and is mostly -a subset of I2C protocols and signaling. Many I2C devices will work on an +a subset of I2C protocols and signaling. Many I2C devices will work on an SMBus, but some SMBus protocols add semantics beyond what is required to -achieve I2C branding. Modern PC mainboards rely on SMBus. The most common +achieve I2C branding. Modern PC mainboards rely on SMBus. The most common devices connected through SMBus are RAM modules configured using I2C EEPROMs, and hardware monitoring chips. Because the SMBus is mostly a subset of the generalized I2C bus, we can -use its protocols on many I2C systems. However, there are systems that don't +use its protocols on many I2C systems. However, there are systems that don't meet both SMBus and I2C electrical constraints; and others which can't implement all the common SMBus protocol semantics or messages. From 75d148c90a34b94a3e3e7e7b2f30a689d8fbb7c8 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 21 Jun 2024 09:30:09 +0200 Subject: [PATCH 265/272] docs: i2c: summary: update I2C specification link MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Luckily, the specs are directly downloadable again, so update the link. Also update its title to the original name "I²C". 
Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/summary.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index 28ff80a2302be..e3ab1d414014d 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -11,11 +11,9 @@ systems. Some systems use variants that don't meet branding requirements, and so are not advertised as being I2C but come under different names, e.g. TWI (Two Wire Interface), IIC. -The latest official I2C specification is the `"I2C-bus specification and user -manual" (UM10204) `_ -published by NXP Semiconductors. However, you need to log-in to the site to -access the PDF. An older version of the specification (revision 6) is archived -`here `_. +The latest official I2C specification is the `"I²C-bus specification and user +manual" (UM10204) `_ +published by NXP Semiconductors, version 7 as of this writing. SMBus (System Management Bus) is based on the I2C protocol, and is mostly a subset of I2C protocols and signaling. Many I2C devices will work on an From a5b88cb9fdff337a2867f0dff7c5cd23d4bd6663 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 21 Jun 2024 09:30:10 +0200 Subject: [PATCH 266/272] docs: i2c: summary: update speed mode description Fastest I2C mode is 5 MHz. Update the docs and reword the paragraph slightly. Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/summary.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index e3ab1d414014d..a1e5c0715f8b4 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -3,8 +3,8 @@ Introduction to I2C and SMBus ============================= I²C (pronounce: I squared C and written I2C in the kernel documentation) is -a protocol developed by Philips. 
It is a slow two-wire protocol (variable -speed, up to 400 kHz), with a high speed extension (3.4 MHz). It provides +a protocol developed by Philips. It is a two-wire protocol with variable +speed (typically up to 400 kHz, high speed modes up to 5 MHz). It provides an inexpensive bus for connecting many types of devices with infrequent or low bandwidth communications needs. I2C is widely used with embedded systems. Some systems use variants that don't meet branding requirements, From d77367fff7c0d67e20393a8236b519d5c48ee875 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 21 Jun 2024 09:30:11 +0200 Subject: [PATCH 267/272] docs: i2c: summary: document use of inclusive language We now have the updated I2C specs and our own Code of Conduct, so we have all we need to switch over to the inclusive terminology. Define them here. Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/i2c_bus.svg | 15 ++++++++------- Documentation/i2c/summary.rst | 23 +++++++++++++++++------ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/Documentation/i2c/i2c_bus.svg b/Documentation/i2c/i2c_bus.svg index 3170de976373c..45801de4af7d9 100644 --- a/Documentation/i2c/i2c_bus.svg +++ b/Documentation/i2c/i2c_bus.svg @@ -1,5 +1,6 @@ + I2CMaster + id="tspan1285">Controller Slave + id="tspan1287">Target Slave + id="tspan1287-6">Target Slave + id="tspan1287-9">Target Date: Fri, 21 Jun 2024 09:30:12 +0200 Subject: [PATCH 268/272] docs: i2c: summary: document 'local' and 'remote' targets Because Linux can be a target as well, add terminology to differentiate between Linux being the target and Linux accessing targets. 
Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/summary.rst | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index a6da1032fa065..ff8bda32b9c3d 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -49,10 +49,15 @@ whole class of I2C adapters. Each specific adapter driver either depends on an algorithm driver in the ``drivers/i2c/algos/`` subdirectory, or includes its own implementation. -A **target** chip is a node that responds to communications when addressed -by the controller. In Linux it is called a **client**. Client drivers are kept -in a directory specific to the feature they provide, for example -``drivers/media/gpio/`` for GPIO expanders and ``drivers/media/i2c/`` for +A **target** chip is a node that responds to communications when addressed by a +controller. In the Linux kernel implementation it is called a **client**. While +targets are usually separate external chips, Linux can also act as a target +(needs hardware support) and respond to another controller on the bus. This is +then called a **local target**. In contrast, an external chip is called a +**remote target**. + +Target drivers are kept in a directory specific to the feature they provide, +for example ``drivers/gpio/`` for GPIO expanders and ``drivers/media/i2c/`` for video-related chips. For the example configuration in figure, you will need a driver for your From 20738cb9fa7ad74c4f374c5b49c8189277df3a9d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 21 Jun 2024 09:30:13 +0200 Subject: [PATCH 269/272] docs: i2c: summary: be clearer with 'controller/target' and 'adapter/client' pairs This not only includes rewording, but also where to put which emphasis on terms in this document. 
Signed-off-by: Wolfram Sang Reviewed-by: Easwar Hariharan Signed-off-by: Wolfram Sang --- Documentation/i2c/summary.rst | 41 ++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index ff8bda32b9c3d..579a1c7df200e 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -31,9 +31,7 @@ implement all the common SMBus protocol semantics or messages. Terminology =========== -The I2C bus connects one or more *controller* chips and one or more *target* -chips. - +The I2C bus connects one or more controller chips and one or more target chips. .. kernel-figure:: i2c_bus.svg :alt: Simple I2C bus with one controller and 3 targets @@ -41,28 +39,37 @@ chips. Simple I2C bus A **controller** chip is a node that starts communications with targets. In the -Linux kernel implementation it is called an **adapter** or bus. Adapter -drivers are in the ``drivers/i2c/busses/`` subdirectory. +Linux kernel implementation it is also called an "adapter" or "bus". Controller +drivers are usually in the ``drivers/i2c/busses/`` subdirectory. -An **algorithm** contains general code that can be used to implement a -whole class of I2C adapters. Each specific adapter driver either depends on -an algorithm driver in the ``drivers/i2c/algos/`` subdirectory, or includes -its own implementation. +An **algorithm** contains general code that can be used to implement a whole +class of I2C controllers. Each specific controller driver either depends on an +algorithm driver in the ``drivers/i2c/algos/`` subdirectory, or includes its +own implementation. A **target** chip is a node that responds to communications when addressed by a -controller. In the Linux kernel implementation it is called a **client**. While -targets are usually separate external chips, Linux can also act as a target -(needs hardware support) and respond to another controller on the bus. 
This is -then called a **local target**. In contrast, an external chip is called a -**remote target**. +controller. In the Linux kernel implementation it is also called a "client". +While targets are usually separate external chips, Linux can also act as a +target (needs hardware support) and respond to another controller on the bus. +This is then called a **local target**. In contrast, an external chip is called +a **remote target**. Target drivers are kept in a directory specific to the feature they provide, for example ``drivers/gpio/`` for GPIO expanders and ``drivers/media/i2c/`` for video-related chips. -For the example configuration in figure, you will need a driver for your -I2C adapter, and drivers for your I2C devices (usually one driver for each -device). +For the example configuration in the figure above, you will need one driver for +the I2C controller, and drivers for your I2C targets. Usually one driver for +each target. + +Synonyms +-------- + +As mentioned above, the Linux I2C implementation historically uses the terms +"adapter" for controller and "client" for target. A number of data structures +have these synonyms in their name. So, when discussing implementation details, +you should be aware of these terms as well. The official wording is preferred, +though. Outdated terminology -------------------- From dab8f9f0fe3aada61c0eb013dcf7d3ff75a2c336 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 21 Jun 2024 16:37:13 +0200 Subject: [PATCH 270/272] pwm: stm32: Fix calculation of prescaler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A small prescaler is beneficial, as this improves the resolution of the duty_cycle configuration. However if the prescaler is too small, the maximal possible period becomes considerably smaller than the requested value. 
One situation where this goes wrong is the following: With a parent clock rate of 208877930 Hz and max_arr = 0xffff = 65535, a request for period = 941243 ns currently results in PSC = 1. The value for ARR is then calculated to ARR = 941243 * 208877930 / (1000000000 * 2) - 1 = 98301 This value is bigger than 65535 however and so doesn't fit into the respective register field. In this particular case the PWM was configured for a period of 313733.4806027616 ns (with ARR = 98301 & 0xffff). Even if ARR was configured to its maximal value, only period = 627495.6861167669 ns would be achievable. Fix the calculation accordingly and adapt the comment to match the new algorithm. With the calculation fixed the above case results in PSC = 2 and so an actual period of 941229.1667195285 ns. Fixes: 8002fbeef1e4 ("pwm: stm32: Calculate prescaler with a division instead of a loop") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/b4d96b79917617434a540df45f20cb5de4142f88.1718979150.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index 3e7b2a8e34e7d..97d3de24f312f 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -321,17 +321,23 @@ static int stm32_pwm_config(struct stm32_pwm *priv, unsigned int ch, * First we need to find the minimal value for prescaler such that * * period_ns * clkrate - * ------------------------------ + * ------------------------------ < max_arr + 1 * NSEC_PER_SEC * (prescaler + 1) * - * isn't bigger than max_arr. 
+ * This equation is equivalent to + * + * period_ns * clkrate + * ---------------------------- < prescaler + 1 + * NSEC_PER_SEC * (max_arr + 1) + * + * Using integer division and knowing that the right hand side is + * integer, this is further equivalent to + * + * (period_ns * clkrate) // (NSEC_PER_SEC * (max_arr + 1)) ≤ prescaler */ prescaler = mul_u64_u64_div_u64(period_ns, clk_get_rate(priv->clk), - (u64)NSEC_PER_SEC * priv->max_arr); - if (prescaler > 0) - prescaler -= 1; - + (u64)NSEC_PER_SEC * ((u64)priv->max_arr + 1)); if (prescaler > MAX_TIM_PSC) return -EINVAL; From f01af3022d4a46362c5dda3d35dea939f3246d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 21 Jun 2024 16:37:14 +0200 Subject: [PATCH 271/272] pwm: stm32: Fix error message to not describe the previous error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "Failed to lock the clock" is an appropriate error message for clk_rate_exclusive_get() failing, but not for the clock running too fast for the driver's calculations. Adapt the error message accordingly. Fixes: d44d635635a7 ("pwm: stm32: Fix for settings using period > UINT32_MAX") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/285182163211203fc823a65b180761f46e828dcb.1718979150.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index 97d3de24f312f..8bae3fd2b3306 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -681,7 +681,8 @@ static int stm32_pwm_probe(struct platform_device *pdev) * .apply() won't overflow. 
*/ if (clk_get_rate(priv->clk) > 1000000000) - return dev_err_probe(dev, -EINVAL, "Failed to lock clock\n"); + return dev_err_probe(dev, -EINVAL, "Clock freq too high (%lu)\n", + clk_get_rate(priv->clk)); chip->ops = &stm32pwm_ops; From f2661062f16b2de5d7b6a5c42a9a5c96326b8454 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Jun 2024 17:08:54 -0400 Subject: [PATCH 272/272] Linux 6.10-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 14427547dc1eb..4d36f943b3b1f 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Baby Opossum Posse # *DOCUMENTATION*