From 6e30a7c98a9fda2f894e970e9cd637657f39c59d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 5 Jun 2024 20:13:15 +0200 Subject: [PATCH 001/352] locking/atomic/x86: Introduce the read64_nonatomic macro to x86_32 with cx8 As described in commit: e73c4e34a0e9 ("locking/atomic/x86: Introduce arch_atomic64_read_nonatomic() to x86_32") the value preload before the CMPXCHG loop does not need to be atomic. Introduce the read64_nonatomic assembly macro to load the value from a atomic_t location in a faster non-atomic way and use it in atomic64_cx8_32.S. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240605181424.3228-1-ubizjak@gmail.com --- arch/x86/lib/atomic64_cx8_32.S | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 90afb488b396a..b2eff07d65e47 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -16,6 +16,11 @@ cmpxchg8b (\reg) .endm +.macro read64_nonatomic reg + movl (\reg), %eax + movl 4(\reg), %edx +.endm + SYM_FUNC_START(atomic64_read_cx8) read64 %ecx RET @@ -51,7 +56,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8) movl %edx, %edi movl %ecx, %ebp - read64 %ecx + read64_nonatomic %ecx 1: movl %eax, %ebx movl %edx, %ecx @@ -79,7 +84,7 @@ addsub_return sub sub sbb SYM_FUNC_START(atomic64_\func\()_return_cx8) pushl %ebx - read64 %esi + read64_nonatomic %esi 1: movl %eax, %ebx movl %edx, %ecx From dce2a224763ce968445e14c43b49321936309c75 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 5 Jun 2024 20:13:16 +0200 Subject: [PATCH 002/352] locking/atomic/x86: Redeclare x86_32 arch_atomic64_{add,sub}() as void Correct the return type of x86_32 arch_atomic64_add() and arch_atomic64_sub() functions to 'void' and remove redundant return. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240605181424.3228-2-ubizjak@gmail.com --- arch/x86/include/asm/atomic64_32.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 8db2ec4d6cdac..1f650b4dde509 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -163,20 +163,18 @@ static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) } #define arch_atomic64_dec_return arch_atomic64_dec_return -static __always_inline s64 arch_atomic64_add(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { __alternative_atomic64(add, add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), ASM_NO_INPUT_CLOBBER("memory")); - return i; } -static __always_inline s64 arch_atomic64_sub(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), ASM_NO_INPUT_CLOBBER("memory")); - return i; } static __always_inline void arch_atomic64_inc(atomic64_t *v) From 7886a61ebc1f3998df5950299cbe17272bf32c59 Mon Sep 17 00:00:00 2001 From: "Luis Claudio R. Goncalves" Date: Tue, 30 Jul 2024 14:45:47 -0300 Subject: [PATCH 003/352] lockdep: suggest the fix for "lockdep bfs error:-1" on print_bfs_bug When lockdep fails while performing the Breadth-first-search operation due to lack of memory, hint that increasing the value of the config switch LOCKDEP_CIRCULAR_QUEUE_BITS should fix the warning. 
Preface the scary backtrace with the suggestion: [ 163.849242] Increase LOCKDEP_CIRCULAR_QUEUE_BITS to avoid this warning: [ 163.849248] ------------[ cut here ]------------ [ 163.849250] lockdep bfs error:-1 [ 163.849263] WARNING: CPU: 24 PID: 2454 at kernel/locking/lockdep.c:2091 print_bfs_bug+0x27/0x40 ... Signed-off-by: Luis Claudio R. Goncalves Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Boqun Feng Reviewed-by: Waiman Long Link: https://lkml.kernel.org/r/Zqkmy0lS-9Sw0M9j@uudg.org --- kernel/locking/lockdep.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 726b22ce7d0b0..fee21f3c3b997 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2067,6 +2067,9 @@ static noinline void print_bfs_bug(int ret) /* * Breadth-first-search failed, graph got corrupted? */ + if (ret == BFS_EQUEUEFULL) + pr_warn("Increase LOCKDEP_CIRCULAR_QUEUE_BITS to avoid this warning:\n"); + WARN(1, "lockdep bfs error:%d\n", ret); } From d5934e76316e84eced836b6b2bafae1837d1cd58 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 29 Mar 2024 16:48:48 -0700 Subject: [PATCH 004/352] cleanup: Add usage and style documentation When proposing that PCI grow some new cleanup helpers for pci_dev_put() and pci_dev_{lock,unlock} [1], Bjorn had some fundamental questions about expectations and best practices. Upon reviewing an updated changelog with those details he recommended adding them to documentation in the header file itself. Add that documentation and link it into the rendering for Documentation/core-api/. Signed-off-by: Dan Williams Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jonathan Cameron Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/171175585714.2192972.12661675876300167762.stgit@dwillia2-xfh.jf.intel.com --- Documentation/core-api/cleanup.rst | 8 ++ Documentation/core-api/index.rst | 1 + include/linux/cleanup.h | 136 +++++++++++++++++++++++++++++ 3 files changed, 145 insertions(+) create mode 100644 Documentation/core-api/cleanup.rst diff --git a/Documentation/core-api/cleanup.rst b/Documentation/core-api/cleanup.rst new file mode 100644 index 0000000000000..527eb2f8ec6ef --- /dev/null +++ b/Documentation/core-api/cleanup.rst @@ -0,0 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +Scope-based Cleanup Helpers +=========================== + +.. kernel-doc:: include/linux/cleanup.h + :doc: scope-based cleanup helpers diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index f147854700e44..b99d2fb3e2f14 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -35,6 +35,7 @@ Library functionality that is used throughout the kernel. kobject kref + cleanup assoc_array xarray maple_tree diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index d9e613803df15..9c6b4f2c01765 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -4,6 +4,142 @@ #include +/** + * DOC: scope-based cleanup helpers + * + * The "goto error" pattern is notorious for introducing subtle resource + * leaks. It is tedious and error prone to add new resource acquisition + * constraints into code paths that already have several unwind + * conditions. The "cleanup" helpers enable the compiler to help with + * this tedium and can aid in maintaining LIFO (last in first out) + * unwind ordering to avoid unintentional leaks. 
+ * + * As drivers make up the majority of the kernel code base, here is an + * example of using these helpers to clean up PCI drivers. The target of + * the cleanups are occasions where a goto is used to unwind a device + * reference (pci_dev_put()), or unlock the device (pci_dev_unlock()) + * before returning. + * + * The DEFINE_FREE() macro can arrange for PCI device references to be + * dropped when the associated variable goes out of scope:: + * + * DEFINE_FREE(pci_dev_put, struct pci_dev *, if (_T) pci_dev_put(_T)) + * ... + * struct pci_dev *dev __free(pci_dev_put) = + * pci_get_slot(parent, PCI_DEVFN(0, 0)); + * + * The above will automatically call pci_dev_put() if @dev is non-NULL + * when @dev goes out of scope (automatic variable scope). If a function + * wants to invoke pci_dev_put() on error, but return @dev (i.e. without + * freeing it) on success, it can do:: + * + * return no_free_ptr(dev); + * + * ...or:: + * + * return_ptr(dev); + * + * The DEFINE_GUARD() macro can arrange for the PCI device lock to be + * dropped when the scope where guard() is invoked ends:: + * + * DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) + * ... + * guard(pci_dev)(dev); + * + * The lifetime of the lock obtained by the guard() helper follows the + * scope of automatic variable declaration. Take the following example:: + * + * func(...) + * { + * if (...) { + * ... + * guard(pci_dev)(dev); // pci_dev_lock() invoked here + * ... + * } // <- implied pci_dev_unlock() triggered here + * } + * + * Observe the lock is held for the remainder of the "if ()" block not + * the remainder of "func()". + * + * Now, when a function uses both __free() and guard(), or multiple + * instances of __free(), the LIFO order of variable definition order + * matters. GCC documentation says: + * + * "When multiple variables in the same scope have cleanup attributes, + * at exit from the scope their associated cleanup functions are run in + * reverse order of definition (last defined, first cleanup)." + * + * When the unwind order matters it requires that variables be defined + * mid-function scope rather than at the top of the file. Take the + * following example and notice the bug highlighted by "!!":: + * + * LIST_HEAD(list); + * DEFINE_MUTEX(lock); + * + * struct object { + * struct list_head node; + * }; + * + * static struct object *alloc_add(void) + * { + * struct object *obj; + * + * lockdep_assert_held(&lock); + * obj = kzalloc(sizeof(*obj), GFP_KERNEL); + * if (obj) { + * LIST_HEAD_INIT(&obj->node); + * list_add(obj->node, &list): + * } + * return obj; + * } + * + * static void remove_free(struct object *obj) + * { + * lockdep_assert_held(&lock); + * list_del(&obj->node); + * kfree(obj); + * } + * + * DEFINE_FREE(remove_free, struct object *, if (_T) remove_free(_T)) + * static int init(void) + * { + * struct object *obj __free(remove_free) = NULL; + * int err; + * + * guard(mutex)(&lock); + * obj = alloc_add(); + * + * if (!obj) + * return -ENOMEM; + * + * err = other_init(obj); + * if (err) + * return err; // remove_free() called without the lock!! + * + * no_free_ptr(obj); + * return 0; + * } + * + * That bug is fixed by changing init() to call guard() and define + + * initialize @obj in this order:: + * + * guard(mutex)(&lock); + * struct object *obj __free(remove_free) = alloc_add(); + * + * Given that the "__free(...) 
= NULL" pattern for variables defined at + * the top of the function poses this potential interdependency problem + * the recommendation is to always define and assign variables in one + * statement and not group variable definitions at the top of the + * function when __free() is used. + * + * Lastly, given that the benefit of cleanup helpers is removal of + * "goto", and that the "goto" statement can jump between scopes, the + * expectation is that usage of "goto" and cleanup helpers is never + * mixed in the same function. I.e. for a given routine, convert all + * resources that need a "goto" cleanup to scope-based cleanup, or + * convert none of them. + */ + /* * DEFINE_FREE(name, type, free): * simple helper macro that defines the required wrapper for a __free() From 13c267f0c27e35ee9372d3cf0dde1ea09db02f13 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 28 May 2024 14:00:09 +0200 Subject: [PATCH 005/352] lockdep: Use str_plural() to fix Coccinelle warning Fixes the following Coccinelle/coccicheck warning reported by string_choices.cocci: opportunity for str_plural(depth) Acked-by: Waiman Long Signed-off-by: Thorsten Blum Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20240528120008.403511-2-thorsten.blum@toblux.com --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index fee21f3c3b997..266f57f36f697 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -785,7 +785,7 @@ static void lockdep_print_held_locks(struct task_struct *p) printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p)); else printk("%d lock%s held by %s/%d:\n", depth, - depth > 1 ? "s" : "", p->comm, task_pid_nr(p)); + str_plural(depth), p->comm, task_pid_nr(p)); /* * It's not reliable to print a task's held locks if it's not sleeping * and it's not the current task. From a6f88ac32c6e63e69c595bfae220d8641704c9b7 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Thu, 20 Jun 2024 22:54:34 +0000 Subject: [PATCH 006/352] lockdep: fix deadlock issue between lockdep and rcu There is a deadlock scenario between lockdep and rcu when rcu nocb feature is enabled, just as following call stack: rcuop/x -000|queued_spin_lock_slowpath(lock = 0xFFFFFF817F2A8A80, val = ?) -001|queued_spin_lock(inline) // try to hold nocb_gp_lock -001|do_raw_spin_lock(lock = 0xFFFFFF817F2A8A80) -002|__raw_spin_lock_irqsave(inline) -002|_raw_spin_lock_irqsave(lock = 0xFFFFFF817F2A8A80) -003|wake_nocb_gp_defer(inline) -003|__call_rcu_nocb_wake(rdp = 0xFFFFFF817F30B680) -004|__call_rcu_common(inline) -004|call_rcu(head = 0xFFFFFFC082EECC28, func = ?) -005|call_rcu_zapped(inline) -005|free_zapped_rcu(ch = ?)// hold graph lock -006|rcu_do_batch(rdp = 0xFFFFFF817F245680) -007|nocb_cb_wait(inline) -007|rcu_nocb_cb_kthread(arg = 0xFFFFFF817F245680) -008|kthread(_create = 0xFFFFFF80803122C0) -009|ret_from_fork(asm) rcuop/y -000|queued_spin_lock_slowpath(lock = 0xFFFFFFC08291BBC8, val = 0) -001|queued_spin_lock() -001|lockdep_lock() -001|graph_lock() // try to hold graph lock -002|lookup_chain_cache_add() -002|validate_chain() -003|lock_acquire -004|_raw_spin_lock_irqsave(lock = 0xFFFFFF817F211D80) -005|lock_timer_base(inline) -006|mod_timer(inline) -006|wake_nocb_gp_defer(inline)// hold nocb_gp_lock -006|__call_rcu_nocb_wake(rdp = 0xFFFFFF817F2A8680) -007|__call_rcu_common(inline) -007|call_rcu(head = 0xFFFFFFC0822E0B58, func = ?) 
-008|call_rcu_hurry(inline) -008|rcu_sync_call(inline) -008|rcu_sync_func(rhp = 0xFFFFFFC0822E0B58) -009|rcu_do_batch(rdp = 0xFFFFFF817F266680) -010|nocb_cb_wait(inline) -010|rcu_nocb_cb_kthread(arg = 0xFFFFFF817F266680) -011|kthread(_create = 0xFFFFFF8080363740) -012|ret_from_fork(asm) rcuop/x and rcuop/y are rcu nocb threads with the same nocb gp thread. This patch release the graph lock before lockdep call_rcu. Fixes: a0b0fd53e1e6 ("locking/lockdep: Free lock classes that are no longer in use") Cc: stable@vger.kernel.org Cc: Boqun Feng Cc: Waiman Long Cc: Carlos Llamas Cc: Bart Van Assche Signed-off-by: Zhiguo Niu Signed-off-by: Xuewen Yan Reviewed-by: Waiman Long Reviewed-by: Carlos Llamas Reviewed-by: Bart Van Assche Signed-off-by: Carlos Llamas Acked-by: Paul E. McKenney Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20240620225436.3127927-1-cmllamas@google.com --- kernel/locking/lockdep.c | 48 ++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 266f57f36f697..b172ead28f1c5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6186,25 +6186,27 @@ static struct pending_free *get_pending_free(void) static void free_zapped_rcu(struct rcu_head *cb); /* - * Schedule an RCU callback if no RCU callback is pending. Must be called with - * the graph lock held. - */ -static void call_rcu_zapped(struct pending_free *pf) +* See if we need to queue an RCU callback, must called with +* the lockdep lock held, returns false if either we don't have +* any pending free or the callback is already scheduled. +* Otherwise, a call_rcu() must follow this function call. +*/ +static bool prepare_call_rcu_zapped(struct pending_free *pf) { WARN_ON_ONCE(inside_selftest()); if (list_empty(&pf->zapped)) - return; + return false; if (delayed_free.scheduled) - return; + return false; delayed_free.scheduled = true; WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf); delayed_free.index ^= 1; - call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + return true; } /* The caller must hold the graph lock. May be called from RCU context. */ @@ -6230,6 +6232,7 @@ static void free_zapped_rcu(struct rcu_head *ch) { struct pending_free *pf; unsigned long flags; + bool need_callback; if (WARN_ON_ONCE(ch != &delayed_free.rcu_head)) return; @@ -6241,14 +6244,18 @@ static void free_zapped_rcu(struct rcu_head *ch) pf = delayed_free.pf + (delayed_free.index ^ 1); __free_zapped_classes(pf); delayed_free.scheduled = false; + need_callback = + prepare_call_rcu_zapped(delayed_free.pf + delayed_free.index); + lockdep_unlock(); + raw_local_irq_restore(flags); /* - * If there's anything on the open list, close and start a new callback. - */ - call_rcu_zapped(delayed_free.pf + delayed_free.index); + * If there's pending free and its callback has not been scheduled, + * queue an RCU callback. 
+ */ + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); - lockdep_unlock(); - raw_local_irq_restore(flags); } /* @@ -6288,6 +6295,7 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) { struct pending_free *pf; unsigned long flags; + bool need_callback; init_data_structures_once(); @@ -6295,10 +6303,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) lockdep_lock(); pf = get_pending_free(); __lockdep_free_key_range(pf, start, size); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); lockdep_unlock(); raw_local_irq_restore(flags); - + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); /* * Wait for any possible iterators from look_up_lock_class() to pass * before continuing to free the memory they refer to. @@ -6392,6 +6401,7 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock) struct pending_free *pf; unsigned long flags; int locked; + bool need_callback = false; raw_local_irq_save(flags); locked = graph_lock(); @@ -6400,11 +6410,13 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock) pf = get_pending_free(); __lockdep_reset_lock(pf, lock); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); graph_unlock(); out_irq: raw_local_irq_restore(flags); + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); } /* @@ -6448,6 +6460,7 @@ void lockdep_unregister_key(struct lock_class_key *key) struct pending_free *pf; unsigned long flags; bool found = false; + bool need_callback = false; might_sleep(); @@ -6468,11 +6481,14 @@ void lockdep_unregister_key(struct lock_class_key *key) if (found) { pf = get_pending_free(); __lockdep_free_key_range(pf, key, 1); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); } lockdep_unlock(); raw_local_irq_restore(flags); + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ synchronize_rcu(); } From 39dea484e2bb9066abbc01e2c5e03b6917b0b775 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Mon, 15 Jul 2024 10:42:17 +0200 Subject: [PATCH 007/352] locking/lockdep: Simplify character output in seq_line() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single characters should be put into a sequence. Thus use the corresponding function “seq_putc” for one selected call. This issue was transformed by using the Coccinelle software. Suggested-by: Christophe Jaillet Signed-off-by: Markus Elfring Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/e346d688-7b01-462f-867c-ba52b7790d19@web.de --- kernel/locking/lockdep_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index e2bfb1db589d8..6db0f43fc4df5 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -424,7 +424,7 @@ static void seq_line(struct seq_file *m, char c, int offset, int length) for (i = 0; i < offset; i++) seq_puts(m, " "); for (i = 0; i < length; i++) - seq_printf(m, "%c", c); + seq_putc(m, c); seq_puts(m, "\n"); } From aa8d1f48d353b0469bff357183ee9df137d15ef0 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 3 Jul 2024 10:10:43 +0800 Subject: [PATCH 008/352] KVM: x86/mmu: Introduce a quirk to control memslot zap behavior Introduce the quirk KVM_X86_QUIRK_SLOT_ZAP_ALL to allow users to select KVM's behavior when a memslot is moved or deleted for KVM_X86_DEFAULT_VM VMs. 
Make sure KVM behave as if the quirk is always disabled for non-KVM_X86_DEFAULT_VM VMs. The KVM_X86_QUIRK_SLOT_ZAP_ALL quirk offers two behavior options: - when enabled: Invalidate/zap all SPTEs ("zap-all"), - when disabled: Precisely zap only the leaf SPTEs within the range of the moving/deleting memory slot ("zap-slot-leafs-only"). "zap-all" is today's KVM behavior to work around a bug [1] where the changing the zapping behavior of memslot move/deletion would cause VM instability for VMs with an Nvidia GPU assigned; while "zap-slot-leafs-only" allows for more precise zapping of SPTEs within the memory slot range, improving performance in certain scenarios [2], and meeting the functional requirements for TDX. Previous attempts to select "zap-slot-leafs-only" include a per-VM capability approach [3] (which was not preferred because the root cause of the bug remained unidentified) and a per-memslot flag approach [4]. Sean and Paolo finally recommended the implementation of this quirk and explained that it's the least bad option [5]. By default, the quirk is enabled on KVM_X86_DEFAULT_VM VMs to use "zap-all". Users have the option to disable the quirk to select "zap-slot-leafs-only" for specific KVM_X86_DEFAULT_VM VMs that are unaffected by this bug. For non-KVM_X86_DEFAULT_VM VMs, the "zap-slot-leafs-only" behavior is always selected without user's opt-in, regardless of if the user opts for "zap-all". This is because it is assumed until proven otherwise that non- KVM_X86_DEFAULT_VM VMs will not be exposed to the bug [1], and most importantly, it's because TDX must have "zap-slot-leafs-only" always selected. In TDX's case a memslot's GPA range can be a mixture of "private" or "shared" memory. Shared is roughly analogous to how EPT is handled for normal VMs, but private GPAs need lots of special treatment: 1) "zap-all" would require to zap private root page or non-leaf entries or at least leaf-entries beyond the deleting memslot scope. However, TDX demands that the root page of the private page table remains unchanged, with leaf entries being zapped before non-leaf entries, and any dropped private guest pages must be re-accepted by the guest. 2) if "zap-all" zaps only shared page tables, it would result in private pages still being mapped when the memslot is gone. This may affect even other processes if later the gmem fd was whole punched, causing the pages being freed on the host while still mapped in the TD, because there's no pgoff to the gfn information to zap the private page table after memslot is gone. So, simply go "zap-slot-leafs-only" as if the quirk is always disabled for non-KVM_X86_DEFAULT_VM VMs to avoid manual opt-in for every VM type [6] or complicating quirk disabling interface (current quirk disabling interface is limited, no way to query quirks, or force them to be disabled). Add a new function kvm_mmu_zap_memslot_leafs() to implement "zap-slot-leafs-only". This function does not call kvm_unmap_gfn_range(), bypassing special handling to APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, as 1) The APIC_ACCESS_PAGE_PRIVATE_MEMSLOT cannot be created by users, nor can it be moved. It is only deleted by KVM when APICv is permanently inhibited. 2) kvm_vcpu_reload_apic_access_page() effectively does nothing when APIC_ACCESS_PAGE_PRIVATE_MEMSLOT is deleted. 3) Avoid making all cpus request of KVM_REQ_APIC_PAGE_RELOAD can save on costly IPIs. 
Suggested-by: Kai Huang Suggested-by: Sean Christopherson Suggested-by: Paolo Bonzini Link: https://patchwork.kernel.org/project/kvm/patch/20190205210137.1377-11-sean.j.christopherson@intel.com [1] Link: https://patchwork.kernel.org/project/kvm/patch/20190205210137.1377-11-sean.j.christopherson@intel.com/#25054908 [2] Link: https://lore.kernel.org/kvm/20200713190649.GE29725@linux.intel.com/T/#mabc0119583dacf621025e9d873c85f4fbaa66d5c [3] Link: https://lore.kernel.org/all/20240515005952.3410568-3-rick.p.edgecombe@intel.com [4] Link: https://lore.kernel.org/all/7df9032d-83e4-46a1-ab29-6c7973a2ab0b@redhat.com [5] Link: https://lore.kernel.org/all/ZnGa550k46ow2N3L@google.com [6] Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Signed-off-by: Yan Zhao Message-ID: <20240703021043.13881-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 8 +++++++ arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/mmu/mmu.c | 42 ++++++++++++++++++++++++++++++++- 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index b3be87489108e..b4d1cf2e46288 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8082,6 +8082,14 @@ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if guest CPUID on writes to MISC_ENABLE if KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is disabled. + +KVM_X86_QUIRK_SLOT_ZAP_ALL By default, KVM invalidates all SPTEs in + fast way for memslot deletion when VM type + is KVM_X86_DEFAULT_VM. + When this quirk is disabled or when VM type + is other than KVM_X86_DEFAULT_VM, KVM zaps + only leaf SPTEs that are within the range of + the memslot being deleted. =================================== ============================================ 7.32 KVM_CAP_MAX_VCPU_ID diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a68cb3eba78f..e4fc362ba3da7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2345,7 +2345,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ - KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ + KVM_X86_QUIRK_SLOT_ZAP_ALL) /* * KVM previously used a u32 field in kvm_run to indicate the hypercall was diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index bf57a824f7228..a8debbf2f7028 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -439,6 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) +#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 928cf84778b0c..f107ec2557c1e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6997,10 +6997,50 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) kvm_mmu_zap_all(kvm); } +/* + * Zapping leaf SPTEs with memslot range when a memslot is moved/deleted. + * + * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst + * case scenario we'll have unused shadow pages lying around until they + * are recycled due to age or when the VM is destroyed. 
+ */ +static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_gfn_range range = { + .slot = slot, + .start = slot->base_gfn, + .end = slot->base_gfn + slot->npages, + .may_block = true, + }; + bool flush = false; + + write_lock(&kvm->mmu_lock); + + if (kvm_memslots_have_rmaps(kvm)) + flush = kvm_handle_gfn_range(kvm, &range, kvm_zap_rmap); + + if (tdp_mmu_enabled) + flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range, flush); + + if (flush) + kvm_flush_remote_tlbs_memslot(kvm, slot); + + write_unlock(&kvm->mmu_lock); +} + +static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_DEFAULT_VM && + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL); +} + void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_zap_all_fast(kvm); + if (kvm_memslot_flush_zap_all(kvm)) + kvm_mmu_zap_all_fast(kvm); + else + kvm_mmu_zap_memslot_leafs(kvm, slot); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) From b4ed2c67d275b85b2ab07d54f88bebd5998d61d8 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 3 Jul 2024 10:11:19 +0800 Subject: [PATCH 009/352] KVM: selftests: Test slot move/delete with slot zap quirk enabled/disabled Update set_memory_region_test to make sure memslot move and deletion function correctly both when slot zap quirk KVM_X86_QUIRK_SLOT_ZAP_ALL is enabled and disabled. Signed-off-by: Yan Zhao Message-ID: <20240703021119.13904-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/set_memory_region_test.c | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index bb8002084f52e..a8267628e9ed1 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -175,7 +175,7 @@ static void guest_code_move_memory_region(void) GUEST_DONE(); } -static void test_move_memory_region(void) +static void test_move_memory_region(bool disable_slot_zap_quirk) { pthread_t vcpu_thread; struct kvm_vcpu *vcpu; @@ -184,6 +184,9 @@ static void test_move_memory_region(void) vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_move_memory_region); + if (disable_slot_zap_quirk) + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); + hva = addr_gpa2hva(vm, MEM_REGION_GPA); /* @@ -266,7 +269,7 @@ static void guest_code_delete_memory_region(void) GUEST_ASSERT(0); } -static void test_delete_memory_region(void) +static void test_delete_memory_region(bool disable_slot_zap_quirk) { pthread_t vcpu_thread; struct kvm_vcpu *vcpu; @@ -276,6 +279,9 @@ static void test_delete_memory_region(void) vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_delete_memory_region); + if (disable_slot_zap_quirk) + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); + /* Delete the memory region, the guest should not die. */ vm_mem_region_delete(vm, MEM_REGION_SLOT); wait_for_vcpu(); @@ -553,7 +559,10 @@ int main(int argc, char *argv[]) { #ifdef __x86_64__ int i, loops; + int j, disable_slot_zap_quirk = 0; + if (kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_SLOT_ZAP_ALL) + disable_slot_zap_quirk = 1; /* * FIXME: the zero-memslot test fails on aarch64 and s390x because * KVM_RUN fails with ENOEXEC or EFAULT. 
@@ -579,13 +588,17 @@ int main(int argc, char *argv[]) else loops = 10; - pr_info("Testing MOVE of in-use region, %d loops\n", loops); - for (i = 0; i < loops; i++) - test_move_memory_region(); + for (j = 0; j <= disable_slot_zap_quirk; j++) { + pr_info("Testing MOVE of in-use region, %d loops, slot zap quirk %s\n", + loops, j ? "disabled" : "enabled"); + for (i = 0; i < loops; i++) + test_move_memory_region(!!j); - pr_info("Testing DELETE of in-use region, %d loops\n", loops); - for (i = 0; i < loops; i++) - test_delete_memory_region(); + pr_info("Testing DELETE of in-use region, %d loops, slot zap quirk %s\n", + loops, j ? "disabled" : "enabled"); + for (i = 0; i < loops; i++) + test_delete_memory_region(!!j); + } #endif return 0; From 218f6415004a881d116e254eeb837358aced55ab Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 3 Jul 2024 10:12:06 +0800 Subject: [PATCH 010/352] KVM: selftests: Allow slot modification stress test with quirk disabled Add a new user option to memslot_modification_stress_test to allow testing with slot zap quirk KVM_X86_QUIRK_SLOT_ZAP_ALL disabled. Signed-off-by: Yan Zhao Message-ID: <20240703021206.13923-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- .../kvm/memslot_modification_stress_test.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 49f162573126f..e3343f0df9e1f 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -79,6 +79,7 @@ struct test_params { useconds_t delay; uint64_t nr_iterations; bool partition_vcpu_memory_access; + bool disable_slot_zap_quirk; }; static void run_test(enum vm_guest_mode mode, void *arg) @@ -89,6 +90,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, VM_MEM_SRC_ANONYMOUS, p->partition_vcpu_memory_access); +#ifdef __x86_64__ + if (p->disable_slot_zap_quirk) + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); + + pr_info("Memslot zap quirk %s\n", p->disable_slot_zap_quirk ? + "disabled" : "enabled"); +#endif pr_info("Finished creating vCPUs\n"); @@ -107,11 +115,12 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-m mode] [-d delay_usec]\n" + printf("usage: %s [-h] [-m mode] [-d delay_usec] [-q]\n" " [-b memory] [-v vcpus] [-o] [-i iterations]\n", name); guest_modes_help(); printf(" -d: add a delay between each iteration of adding and\n" " deleting a memslot in usec.\n"); + printf(" -q: Disable memslot zap quirk.\n"); printf(" -b: specify the size of the memory region which should be\n" " accessed by each vCPU. e.g. 
10M or 3G.\n" " Default: 1G\n"); @@ -137,7 +146,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) { + while ((opt = getopt(argc, argv, "hm:d:qb:v:oi:")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -160,6 +169,12 @@ int main(int argc, char *argv[]) case 'i': p.nr_iterations = atoi_positive("Number of iterations", optarg); break; + case 'q': + p.disable_slot_zap_quirk = true; + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & + KVM_X86_QUIRK_SLOT_ZAP_ALL); + break; case 'h': default: help(argv[0]); From 61de4c34b51c5b9c7ef8229eb246346254638446 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 3 Jul 2024 10:12:19 +0800 Subject: [PATCH 011/352] KVM: selftests: Test memslot move in memslot_perf_test with quirk disabled Add a new user option to memslot_perf_test to allow testing memslot move with quirk KVM_X86_QUIRK_SLOT_ZAP_ALL disabled. Signed-off-by: Yan Zhao Message-ID: <20240703021219.13939-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/memslot_perf_test.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 579a64f97333b..893366982f77b 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -113,6 +113,7 @@ static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless"); static sem_t vcpu_ready; static bool map_unmap_verify; +static bool disable_slot_zap_quirk; static bool verbose; #define pr_info_v(...) \ @@ -578,6 +579,9 @@ static bool test_memslot_move_prepare(struct vm_data *data, uint32_t guest_page_size = data->vm->page_size; uint64_t movesrcgpa, movetestgpa; + if (disable_slot_zap_quirk) + vm_enable_cap(data->vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); + movesrcgpa = vm_slot2gpa(data, data->nslots - 1); if (isactive) { @@ -896,6 +900,7 @@ static void help(char *name, struct test_args *targs) pr_info(" -h: print this help screen.\n"); pr_info(" -v: enable verbose mode (not for benchmarking).\n"); pr_info(" -d: enable extra debug checks.\n"); + pr_info(" -q: Disable memslot zap quirk during memslot move.\n"); pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n", targs->nslots); pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n", @@ -954,7 +959,7 @@ static bool parse_args(int argc, char *argv[], uint32_t max_mem_slots; int opt; - while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) { + while ((opt = getopt(argc, argv, "hvdqs:f:e:l:r:")) != -1) { switch (opt) { case 'h': default: @@ -966,6 +971,11 @@ static bool parse_args(int argc, char *argv[], case 'd': map_unmap_verify = true; break; + case 'q': + disable_slot_zap_quirk = true; + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & + KVM_X86_QUIRK_SLOT_ZAP_ALL); + break; case 's': targs->nslots = atoi_paranoid(optarg); if (targs->nslots <= 1 && targs->nslots != -1) { From 09c38ad044e61f4ef3b321669361b161d5a5242f Mon Sep 17 00:00:00 2001 From: Hariharan Mari Date: Thu, 4 Jul 2024 13:14:00 +0200 Subject: [PATCH 012/352] KVM: s390: Fix SORTL and DFLTCC instruction format error in __insn32_query The __insn32_query() function incorrectly uses the RRF instruction format for both the SORTL (RRE format) and DFLTCC (RRF format) instructions. 
To fix this issue, add separate query functions for SORTL and DFLTCC that use the appropriate instruction formats. Additionally pass the query operand as a pointer to the entire array of 32 elements to slightly optimize performance and readability. Fixes: d668139718a9 ("KVM: s390: provide query function for instructions returning 32 byte") Suggested-by: Heiko Carstens Reviewed-by: Juergen Christ Signed-off-by: Hariharan Mari Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0fd96860fc458..bb7134faaebff 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -348,20 +348,29 @@ static inline int plo_test_bit(unsigned char nr) return cc == 0; } -static __always_inline void __insn32_query(unsigned int opcode, u8 *query) +static __always_inline void __sortl_query(u8 (*query)[32]) { asm volatile( " lghi 0,0\n" - " lgr 1,%[query]\n" + " la 1,%[query]\n" /* Parameter registers are ignored */ - " .insn rrf,%[opc] << 16,2,4,6,0\n" + " .insn rre,0xb9380000,2,4\n" + : [query] "=R" (*query) : - : [query] "d" ((unsigned long)query), [opc] "i" (opcode) - : "cc", "memory", "0", "1"); + : "cc", "0", "1"); } -#define INSN_SORTL 0xb938 -#define INSN_DFLTCC 0xb939 +static __always_inline void __dfltcc_query(u8 (*query)[32]) +{ + asm volatile( + " lghi 0,0\n" + " la 1,%[query]\n" + /* Parameter registers are ignored */ + " .insn rrf,0xb9390000,2,4,6,0\n" + : [query] "=R" (*query) + : + : "cc", "0", "1"); +} static void __init kvm_s390_cpu_feat_init(void) { @@ -415,10 +424,10 @@ static void __init kvm_s390_cpu_feat_init(void) kvm_s390_available_subfunc.kdsa); if (test_facility(150)) /* SORTL */ - __insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl); + __sortl_query(&kvm_s390_available_subfunc.sortl); if (test_facility(151)) /* DFLTCC */ - __insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc); + __dfltcc_query(&kvm_s390_available_subfunc.dfltcc); if (MACHINE_HAS_ESOP) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); From 252b6fd2e186b793b960fa28ffccb07ccc4d5f51 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:03 +0200 Subject: [PATCH 013/352] selftests: kvm: s390: Define page sizes in shared header Multiple test cases need page size and shift definitions. By moving the definitions to a single architecture specific header we limit the repetition. Make use of PAGE_SIZE, PAGE_SHIFT and PAGE_MASK defines in existing code. 
Signed-off-by: Christoph Schlameuss Reviewed-by: Claudio Imbrenda Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240807154512.316936-2-schlameuss@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-2-schlameuss@linux.ibm.com> --- tools/testing/selftests/kvm/include/s390x/processor.h | 5 +++++ tools/testing/selftests/kvm/lib/s390x/processor.c | 10 +++++----- tools/testing/selftests/kvm/s390x/cmma_test.c | 7 ++++--- tools/testing/selftests/kvm/s390x/memop.c | 4 +--- tools/testing/selftests/kvm/s390x/tprot.c | 5 ++--- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/kvm/include/s390x/processor.h b/tools/testing/selftests/kvm/include/s390x/processor.h index 255c9b990f4cd..481bd2fd6a326 100644 --- a/tools/testing/selftests/kvm/include/s390x/processor.h +++ b/tools/testing/selftests/kvm/include/s390x/processor.h @@ -21,6 +21,11 @@ #define PAGE_PROTECT 0x200 /* HW read-only bit */ #define PAGE_NOEXEC 0x100 /* HW no-execute bit */ +/* Page size definitions */ +#define PAGE_SHIFT 12 +#define PAGE_SIZE BIT_ULL(PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE - 1)) + /* Is there a portable way to do this? */ static inline void cpu_relax(void) { diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 4ad4492eea1d9..20cfe970e3e34 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -14,7 +14,7 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) { vm_paddr_t paddr; - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); if (vm->pgd_created) @@ -79,7 +79,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) } /* Fill in page table entry */ - idx = (gva >> 12) & 0x0ffu; /* page index */ + idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ if (!(entry[idx] & PAGE_INVALID)) fprintf(stderr, "WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa); @@ -91,7 +91,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) int ri, idx; uint64_t *entry; - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); entry = addr_gpa2hva(vm, vm->pgd); @@ -103,7 +103,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); } - idx = (gva >> 12) & 0x0ffu; /* page index */ + idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ TEST_ASSERT(!(entry[idx] & PAGE_INVALID), "No page mapping for vm virtual address 0x%lx", gva); @@ -168,7 +168,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) struct kvm_sregs sregs; struct kvm_vcpu *vcpu; - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); stack_vaddr = __vm_vaddr_alloc(vm, stack_size, diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c index b390338447568..e32dd59703a08 100644 --- a/tools/testing/selftests/kvm/s390x/cmma_test.c +++ b/tools/testing/selftests/kvm/s390x/cmma_test.c @@ -17,16 +17,17 @@ #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" +#include "processor.h" #define MAIN_PAGE_COUNT 512 #define TEST_DATA_PAGE_COUNT 512 #define TEST_DATA_MEMSLOT 1 -#define 
TEST_DATA_START_GFN 4096 +#define TEST_DATA_START_GFN PAGE_SIZE #define TEST_DATA_TWO_PAGE_COUNT 256 #define TEST_DATA_TWO_MEMSLOT 2 -#define TEST_DATA_TWO_START_GFN 8192 +#define TEST_DATA_TWO_START_GFN (2 * PAGE_SIZE) static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT]; @@ -66,7 +67,7 @@ static void guest_dirty_test_data(void) " lghi 5,%[page_count]\n" /* r5 += r1 */ "2: agfr 5,1\n" - /* r2 = r1 << 12 */ + /* r2 = r1 << PAGE_SHIFT */ "1: sllg 2,1,12(0)\n" /* essa(r4, r2, SET_STABLE) */ " .insn rrf,0xb9ab0000,4,2,1,0\n" diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index f2df7416be847..4374b4cd2a807 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -16,6 +16,7 @@ #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" +#include "processor.h" enum mop_target { LOGICAL, @@ -226,9 +227,6 @@ static void memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo, #define CHECK_N_DO(f, ...) ({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); }) -#define PAGE_SHIFT 12 -#define PAGE_SIZE (1ULL << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE - 1)) #define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) #define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c index 7a742a673b7c9..12d5e1cb62e34 100644 --- a/tools/testing/selftests/kvm/s390x/tprot.c +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -9,9 +9,8 @@ #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" +#include "processor.h" -#define PAGE_SHIFT 12 -#define PAGE_SIZE (1 << PAGE_SHIFT) #define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) #define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) @@ -151,7 +150,7 @@ static enum stage perform_next_stage(int *i, bool mapped_0) * instead. * In order to skip these tests we detect this inside the guest */ - skip = tests[*i].addr < (void *)4096 && + skip = tests[*i].addr < (void *)PAGE_SIZE && tests[*i].expected != TRANSL_UNAVAIL && !mapped_0; if (!skip) { From 845482188e3890269927c47316bcbacee9f71a3f Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:04 +0200 Subject: [PATCH 014/352] selftests: kvm: s390: Add kvm_s390_sie_block definition for userspace tests Subsequent tests do require direct manipulation of the SIE control block. This commit introduces the SIE control block definition for use within the selftests. There are already definitions of this within the kernel. This differs in two ways. * This is the first definition of this in userspace. * In the context of the selftests this does not require atomicity for the flags. With the userspace definition of the SIE block layout now being present we can reuse the values in other tests where applicable. 
Signed-off-by: Christoph Schlameuss Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240807154512.316936-3-schlameuss@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-3-schlameuss@linux.ibm.com> --- .../testing/selftests/kvm/include/s390x/sie.h | 240 ++++++++++++++++++ .../testing/selftests/kvm/s390x/debug_test.c | 4 +- 2 files changed, 242 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/kvm/include/s390x/sie.h diff --git a/tools/testing/selftests/kvm/include/s390x/sie.h b/tools/testing/selftests/kvm/include/s390x/sie.h new file mode 100644 index 0000000000000..160acd4a1db92 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390x/sie.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Definition for kernel virtual machines on s390. + * + * Adapted copy of struct definition kvm_s390_sie_block from + * arch/s390/include/asm/kvm_host.h for use in userspace selftest programs. + * + * Copyright IBM Corp. 2008, 2024 + * + * Authors: + * Christoph Schlameuss + * Carsten Otte + */ + +#ifndef SELFTEST_KVM_SIE_H +#define SELFTEST_KVM_SIE_H + +#include + +struct kvm_s390_sie_block { +#define CPUSTAT_STOPPED 0x80000000 +#define CPUSTAT_WAIT 0x10000000 +#define CPUSTAT_ECALL_PEND 0x08000000 +#define CPUSTAT_STOP_INT 0x04000000 +#define CPUSTAT_IO_INT 0x02000000 +#define CPUSTAT_EXT_INT 0x01000000 +#define CPUSTAT_RUNNING 0x00800000 +#define CPUSTAT_RETAINED 0x00400000 +#define CPUSTAT_TIMING_SUB 0x00020000 +#define CPUSTAT_SIE_SUB 0x00010000 +#define CPUSTAT_RRF 0x00008000 +#define CPUSTAT_SLSV 0x00004000 +#define CPUSTAT_SLSR 0x00002000 +#define CPUSTAT_ZARCH 0x00000800 +#define CPUSTAT_MCDS 0x00000100 +#define CPUSTAT_KSS 0x00000200 +#define CPUSTAT_SM 0x00000080 +#define CPUSTAT_IBS 0x00000040 +#define CPUSTAT_GED2 0x00000010 +#define CPUSTAT_G 0x00000008 +#define CPUSTAT_GED 0x00000004 +#define CPUSTAT_J 0x00000002 +#define CPUSTAT_P 0x00000001 + __u32 cpuflags; /* 0x0000 */ + __u32: 1; /* 0x0004 */ + __u32 prefix : 18; + __u32: 1; + __u32 ibc : 12; + __u8 reserved08[4]; /* 0x0008 */ +#define PROG_IN_SIE BIT(0) + __u32 prog0c; /* 0x000c */ + union { + __u8 reserved10[16]; /* 0x0010 */ + struct { + __u64 pv_handle_cpu; + __u64 pv_handle_config; + }; + }; +#define PROG_BLOCK_SIE BIT(0) +#define PROG_REQUEST BIT(1) + __u32 prog20; /* 0x0020 */ + __u8 reserved24[4]; /* 0x0024 */ + __u64 cputm; /* 0x0028 */ + __u64 ckc; /* 0x0030 */ + __u64 epoch; /* 0x0038 */ + __u32 svcc; /* 0x0040 */ +#define LCTL_CR0 0x8000 +#define LCTL_CR6 0x0200 +#define LCTL_CR9 0x0040 +#define LCTL_CR10 0x0020 +#define LCTL_CR11 0x0010 +#define LCTL_CR14 0x0002 + __u16 lctl; /* 0x0044 */ + __s16 icpua; /* 0x0046 */ +#define ICTL_OPEREXC 0x80000000 +#define ICTL_PINT 0x20000000 +#define ICTL_LPSW 0x00400000 +#define ICTL_STCTL 0x00040000 +#define ICTL_ISKE 0x00004000 +#define ICTL_SSKE 0x00002000 +#define ICTL_RRBE 0x00001000 +#define ICTL_TPROT 0x00000200 + __u32 ictl; /* 0x0048 */ +#define ECA_CEI 0x80000000 +#define ECA_IB 0x40000000 +#define ECA_SIGPI 0x10000000 +#define ECA_MVPGI 0x01000000 +#define ECA_AIV 0x00200000 +#define ECA_VX 0x00020000 +#define ECA_PROTEXCI 0x00002000 +#define ECA_APIE 0x00000008 +#define ECA_SII 0x00000001 + __u32 eca; /* 0x004c */ +#define ICPT_INST 0x04 +#define ICPT_PROGI 0x08 +#define ICPT_INSTPROGI 0x0C +#define ICPT_EXTREQ 0x10 +#define ICPT_EXTINT 0x14 +#define ICPT_IOREQ 0x18 +#define ICPT_WAIT 0x1c +#define ICPT_VALIDITY 0x20 +#define ICPT_STOP 0x28 +#define ICPT_OPEREXC 0x2C +#define ICPT_PARTEXEC 
0x38 +#define ICPT_IOINST 0x40 +#define ICPT_KSS 0x5c +#define ICPT_MCHKREQ 0x60 +#define ICPT_INT_ENABLE 0x64 +#define ICPT_PV_INSTR 0x68 +#define ICPT_PV_NOTIFY 0x6c +#define ICPT_PV_PREF 0x70 + __u8 icptcode; /* 0x0050 */ + __u8 icptstatus; /* 0x0051 */ + __u16 ihcpu; /* 0x0052 */ + __u8 reserved54; /* 0x0054 */ +#define IICTL_CODE_NONE 0x00 +#define IICTL_CODE_MCHK 0x01 +#define IICTL_CODE_EXT 0x02 +#define IICTL_CODE_IO 0x03 +#define IICTL_CODE_RESTART 0x04 +#define IICTL_CODE_SPECIFICATION 0x10 +#define IICTL_CODE_OPERAND 0x11 + __u8 iictl; /* 0x0055 */ + __u16 ipa; /* 0x0056 */ + __u32 ipb; /* 0x0058 */ + __u32 scaoh; /* 0x005c */ +#define FPF_BPBC 0x20 + __u8 fpf; /* 0x0060 */ +#define ECB_GS 0x40 +#define ECB_TE 0x10 +#define ECB_SPECI 0x08 +#define ECB_SRSI 0x04 +#define ECB_HOSTPROTINT 0x02 +#define ECB_PTF 0x01 + __u8 ecb; /* 0x0061 */ +#define ECB2_CMMA 0x80 +#define ECB2_IEP 0x20 +#define ECB2_PFMFI 0x08 +#define ECB2_ESCA 0x04 +#define ECB2_ZPCI_LSI 0x02 + __u8 ecb2; /* 0x0062 */ +#define ECB3_AISI 0x20 +#define ECB3_AISII 0x10 +#define ECB3_DEA 0x08 +#define ECB3_AES 0x04 +#define ECB3_RI 0x01 + __u8 ecb3; /* 0x0063 */ +#define ESCA_SCAOL_MASK ~0x3fU + __u32 scaol; /* 0x0064 */ + __u8 sdf; /* 0x0068 */ + __u8 epdx; /* 0x0069 */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ + __u32 todpr; /* 0x006c */ +#define GISA_FORMAT1 0x00000001 + __u32 gd; /* 0x0070 */ + __u8 reserved74[12]; /* 0x0074 */ + __u64 mso; /* 0x0080 */ + __u64 msl; /* 0x0088 */ + __u64 psw_mask; /* 0x0090 */ + __u64 psw_addr; /* 0x0098 */ + __u64 gg14; /* 0x00a0 */ + __u64 gg15; /* 0x00a8 */ + __u8 reservedb0[8]; /* 0x00b0 */ +#define HPID_KVM 0x4 +#define HPID_VSIE 0x5 + __u8 hpid; /* 0x00b8 */ + __u8 reservedb9[7]; /* 0x00b9 */ + union { + struct { + __u32 eiparams; /* 0x00c0 */ + __u16 extcpuaddr; /* 0x00c4 */ + __u16 eic; /* 0x00c6 */ + }; + __u64 mcic; /* 0x00c0 */ + } __packed; + __u32 reservedc8; /* 0x00c8 */ + union { + struct { + __u16 pgmilc; /* 0x00cc */ + __u16 iprcc; /* 0x00ce */ + }; + __u32 edc; /* 0x00cc */ + } __packed; + union { + struct { + __u32 dxc; /* 0x00d0 */ + __u16 mcn; /* 0x00d4 */ + __u8 perc; /* 0x00d6 */ + __u8 peratmid; /* 0x00d7 */ + }; + __u64 faddr; /* 0x00d0 */ + } __packed; + __u64 peraddr; /* 0x00d8 */ + __u8 eai; /* 0x00e0 */ + __u8 peraid; /* 0x00e1 */ + __u8 oai; /* 0x00e2 */ + __u8 armid; /* 0x00e3 */ + __u8 reservede4[4]; /* 0x00e4 */ + union { + __u64 tecmc; /* 0x00e8 */ + struct { + __u16 subchannel_id; /* 0x00e8 */ + __u16 subchannel_nr; /* 0x00ea */ + __u32 io_int_parm; /* 0x00ec */ + __u32 io_int_word; /* 0x00f0 */ + }; + } __packed; + __u8 reservedf4[8]; /* 0x00f4 */ +#define CRYCB_FORMAT_MASK 0x00000003 +#define CRYCB_FORMAT0 0x00000000 +#define CRYCB_FORMAT1 0x00000001 +#define CRYCB_FORMAT2 0x00000003 + __u32 crycbd; /* 0x00fc */ + __u64 gcr[16]; /* 0x0100 */ + union { + __u64 gbea; /* 0x0180 */ + __u64 sidad; + }; + __u8 reserved188[8]; /* 0x0188 */ + __u64 sdnxo; /* 0x0190 */ + __u8 reserved198[8]; /* 0x0198 */ + __u32 fac; /* 0x01a0 */ + __u8 reserved1a4[20]; /* 0x01a4 */ + __u64 cbrlo; /* 0x01b8 */ + __u8 reserved1c0[8]; /* 0x01c0 */ +#define ECD_HOSTREGMGMT 0x20000000 +#define ECD_MEF 0x08000000 +#define ECD_ETOKENF 0x02000000 +#define ECD_ECC 0x00200000 + __u32 ecd; /* 0x01c8 */ + __u8 reserved1cc[18]; /* 0x01cc */ + __u64 pp; /* 0x01de */ + __u8 reserved1e6[2]; /* 0x01e6 */ + __u64 itdba; /* 0x01e8 */ + __u64 riccbd; /* 0x01f0 */ + __u64 gvrd; /* 0x01f8 */ +} __packed __aligned(512); + +#endif /* SELFTEST_KVM_SIE_H */ diff --git 
a/tools/testing/selftests/kvm/s390x/debug_test.c b/tools/testing/selftests/kvm/s390x/debug_test.c index 84313fb275291..ad80959686014 100644 --- a/tools/testing/selftests/kvm/s390x/debug_test.c +++ b/tools/testing/selftests/kvm/s390x/debug_test.c @@ -2,12 +2,12 @@ /* Test KVM debugging features. */ #include "kvm_util.h" #include "test_util.h" +#include "sie.h" #include #define __LC_SVC_NEW_PSW 0x1c0 #define __LC_PGM_NEW_PSW 0x1d0 -#define ICPT_INSTRUCTION 0x04 #define IPA0_DIAG 0x8300 #define PGM_SPECIFICATION 0x06 @@ -85,7 +85,7 @@ static void test_step_pgm_diag(void) vm = test_step_int_1(&vcpu, test_step_pgm_diag_guest_code, __LC_PGM_NEW_PSW, new_psw); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); - TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INSTRUCTION); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INST); TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa & 0xff00, IPA0_DIAG); vcpu_ioctl(vcpu, KVM_S390_IRQ, &irq); vcpu_run(vcpu); From 011901fc222435f5bf2d6accb392f749ddebdfe7 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:05 +0200 Subject: [PATCH 015/352] selftests: kvm: s390: Add s390x ucontrol test suite with hpage test Add test suite to validate the s390x architecture specific ucontrol KVM interface. Make use of the selftest test harness. * uc_cap_hpage testcase verifies that a ucontrol VM cannot be run with hugepages. To allow testing of the ucontrol interface the kernel needs a non-default config containing CONFIG_KVM_S390_UCONTROL. This config needs to be set to built-in (y) as this cannot be built as module. Signed-off-by: Christoph Schlameuss Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240807154512.316936-4-schlameuss@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-4-schlameuss@linux.ibm.com> --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/s390x/config | 2 + .../selftests/kvm/s390x/ucontrol_test.c | 76 +++++++++++++++++++ 4 files changed, 80 insertions(+) create mode 100644 tools/testing/selftests/kvm/s390x/config create mode 100644 tools/testing/selftests/kvm/s390x/ucontrol_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 6d9381d60172f..f2a30a58cd718 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -5,3 +5,4 @@ !*.h !*.S !*.sh +!config diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 48d32c5aa3eb7..b3dc0a5bf0d4c 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -186,6 +186,7 @@ TEST_GEN_PROGS_s390x += s390x/tprot TEST_GEN_PROGS_s390x += s390x/cmma_test TEST_GEN_PROGS_s390x += s390x/debug_test TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test +TEST_GEN_PROGS_s390x += s390x/ucontrol_test TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += guest_print_test diff --git a/tools/testing/selftests/kvm/s390x/config b/tools/testing/selftests/kvm/s390x/config new file mode 100644 index 0000000000000..23270f2d679f0 --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/config @@ -0,0 +1,2 @@ +CONFIG_KVM=y +CONFIG_KVM_S390_UCONTROL=y diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c new file mode 100644 index 0000000000000..cd68e7e37d358 --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c @@ 
-0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test code for the s390x kvm ucontrol interface + * + * Copyright IBM Corp. 2024 + * + * Authors: + * Christoph Schlameuss + */ +#include "kselftest_harness.h" +#include "kvm_util.h" + +#include +#include + +/* so directly declare capget to check caps without libcap */ +int capget(cap_user_header_t header, cap_user_data_t data); + +/** + * In order to create user controlled virtual machines on S390, + * check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL + * as privileged user (SYS_ADMIN). + */ +void require_ucontrol_admin(void) +{ + struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; + struct __user_cap_header_struct hdr = { + .version = _LINUX_CAPABILITY_VERSION_3, + }; + int rc; + + rc = capget(&hdr, data); + TEST_ASSERT_EQ(0, rc); + TEST_REQUIRE((data->effective & CAP_TO_MASK(CAP_SYS_ADMIN)) > 0); + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL)); +} + +/** + * Assert HPAGE CAP cannot be enabled on UCONTROL VM + */ +TEST(uc_cap_hpage) +{ + int rc, kvm_fd, vm_fd, vcpu_fd; + struct kvm_enable_cap cap = { + .cap = KVM_CAP_S390_HPAGE_1M, + }; + + require_ucontrol_admin(); + + kvm_fd = open_kvm_dev_path_or_exit(); + vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); + ASSERT_GE(vm_fd, 0); + + /* assert hpages are not supported on ucontrol vm */ + rc = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_HPAGE_1M); + EXPECT_EQ(0, rc); + + /* Test that KVM_CAP_S390_HPAGE_1M can't be enabled for a ucontrol vm */ + rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EINVAL, errno); + + /* assert HPAGE CAP is rejected after vCPU creation */ + vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); + ASSERT_GE(vcpu_fd, 0); + rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EBUSY, errno); + + close(vcpu_fd); + close(vm_fd); + close(kvm_fd); +} + +TEST_HARNESS_MAIN From d4f8592f6c42a6df7f570be6c4ccf8bd40838aab Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:06 +0200 Subject: [PATCH 016/352] selftests: kvm: s390: Add test fixture and simple VM setup tests Add a uc_kvm fixture to create and destroy a ucontrol VM. * uc_sie_assertions asserts basic settings in the SIE as setup by the kernel. * uc_attr_mem_limit asserts the memory limit is max value and cannot be set (not supported). * uc_no_dirty_log asserts dirty log is not supported. 
Signed-off-by: Christoph Schlameuss Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20240807154512.316936-5-schlameuss@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-5-schlameuss@linux.ibm.com> --- .../selftests/kvm/s390x/ucontrol_test.c | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c index cd68e7e37d358..d103a92e7495b 100644 --- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c +++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c @@ -9,10 +9,14 @@ */ #include "kselftest_harness.h" #include "kvm_util.h" +#include "processor.h" +#include "sie.h" #include #include +#define VM_MEM_SIZE (4 * SZ_1M) + /* so directly declare capget to check caps without libcap */ int capget(cap_user_header_t header, cap_user_data_t data); @@ -36,6 +40,133 @@ void require_ucontrol_admin(void) TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL)); } +FIXTURE(uc_kvm) +{ + struct kvm_s390_sie_block *sie_block; + struct kvm_run *run; + uintptr_t base_gpa; + uintptr_t code_gpa; + uintptr_t base_hva; + uintptr_t code_hva; + int kvm_run_size; + void *vm_mem; + int vcpu_fd; + int kvm_fd; + int vm_fd; +}; + +/** + * create VM with single vcpu, map kvm_run and SIE control block for easy access + */ +FIXTURE_SETUP(uc_kvm) +{ + struct kvm_s390_vm_cpu_processor info; + int rc; + + require_ucontrol_admin(); + + self->kvm_fd = open_kvm_dev_path_or_exit(); + self->vm_fd = ioctl(self->kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); + ASSERT_GE(self->vm_fd, 0); + + kvm_device_attr_get(self->vm_fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info); + TH_LOG("create VM 0x%llx", info.cpuid); + + self->vcpu_fd = ioctl(self->vm_fd, KVM_CREATE_VCPU, 0); + ASSERT_GE(self->vcpu_fd, 0); + + self->kvm_run_size = ioctl(self->kvm_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); + ASSERT_GE(self->kvm_run_size, sizeof(struct kvm_run)) + TH_LOG(KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, self->kvm_run_size)); + self->run = (struct kvm_run *)mmap(NULL, self->kvm_run_size, + PROT_READ | PROT_WRITE, MAP_SHARED, self->vcpu_fd, 0); + ASSERT_NE(self->run, MAP_FAILED); + /** + * For virtual cpus that have been created with S390 user controlled + * virtual machines, the resulting vcpu fd can be memory mapped at page + * offset KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of + * the virtual cpu's hardware control block. 
+ */ + self->sie_block = (struct kvm_s390_sie_block *)mmap(NULL, PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_SHARED, + self->vcpu_fd, KVM_S390_SIE_PAGE_OFFSET << PAGE_SHIFT); + ASSERT_NE(self->sie_block, MAP_FAILED); + + TH_LOG("VM created %p %p", self->run, self->sie_block); + + self->base_gpa = 0; + self->code_gpa = self->base_gpa + (3 * SZ_1M); + + self->vm_mem = aligned_alloc(SZ_1M, VM_MEM_SIZE); + ASSERT_NE(NULL, self->vm_mem) TH_LOG("malloc failed %u", errno); + self->base_hva = (uintptr_t)self->vm_mem; + self->code_hva = self->base_hva - self->base_gpa + self->code_gpa; + struct kvm_s390_ucas_mapping map = { + .user_addr = self->base_hva, + .vcpu_addr = self->base_gpa, + .length = VM_MEM_SIZE, + }; + TH_LOG("ucas map %p %p 0x%llx", + (void *)map.user_addr, (void *)map.vcpu_addr, map.length); + rc = ioctl(self->vcpu_fd, KVM_S390_UCAS_MAP, &map); + ASSERT_EQ(0, rc) TH_LOG("ucas map result %d not expected, %s", + rc, strerror(errno)); + + TH_LOG("page in %p", (void *)self->base_gpa); + rc = ioctl(self->vcpu_fd, KVM_S390_VCPU_FAULT, self->base_gpa); + ASSERT_EQ(0, rc) TH_LOG("vcpu fault (%p) result %d not expected, %s", + (void *)self->base_hva, rc, strerror(errno)); + + self->sie_block->cpuflags &= ~CPUSTAT_STOPPED; +} + +FIXTURE_TEARDOWN(uc_kvm) +{ + munmap(self->sie_block, PAGE_SIZE); + munmap(self->run, self->kvm_run_size); + close(self->vcpu_fd); + close(self->vm_fd); + close(self->kvm_fd); + free(self->vm_mem); +} + +TEST_F(uc_kvm, uc_sie_assertions) +{ + /* assert interception of Code 08 (Program Interruption) is set */ + EXPECT_EQ(0, self->sie_block->ecb & ECB_SPECI); +} + +TEST_F(uc_kvm, uc_attr_mem_limit) +{ + u64 limit; + struct kvm_device_attr attr = { + .group = KVM_S390_VM_MEM_CTRL, + .attr = KVM_S390_VM_MEM_LIMIT_SIZE, + .addr = (unsigned long)&limit, + }; + int rc; + + rc = ioctl(self->vm_fd, KVM_GET_DEVICE_ATTR, &attr); + EXPECT_EQ(0, rc); + EXPECT_EQ(~0UL, limit); + + /* assert set not supported */ + rc = ioctl(self->vm_fd, KVM_SET_DEVICE_ATTR, &attr); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EINVAL, errno); +} + +TEST_F(uc_kvm, uc_no_dirty_log) +{ + struct kvm_dirty_log dlog; + int rc; + + rc = ioctl(self->vm_fd, KVM_GET_DIRTY_LOG, &dlog); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EINVAL, errno); +} + /** * Assert HPAGE CAP cannot be enabled on UCONTROL VM */ From 100932fc37d468bfa80b5675b653ff65f7bafba7 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:07 +0200 Subject: [PATCH 017/352] selftests: kvm: s390: Add debug print functions Add functions to simply print some basic state information in selftests. The output can be enabled by setting: #define TH_LOG_ENABLED 1 #define DEBUG 1 * print_psw: current SIE state description and VM run state * print_hex_bytes: print memory with some counting markers * print_hex: PRINT_HEX with 512 bytes * print_run: use print_psw and print_hex to print contents of VM run state and SIE state description * print_regs: print content of general and control registers All prints use pr_debug for the output and can be configured using DEBUG. 
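A hypothetical usage sketch, with output enabled as described above (the helpers are the ones added below; pr_debug() compiles away when DEBUG is not defined):

	#define TH_LOG_ENABLED 1
	#define DEBUG 1
	#include "debug_print.h"

	/* e.g. in a test body after KVM_RUN returns: */
	print_run(self->run, self->sie_block);	/* psw/flags plus raw kvm_run and SIE block dumps */
	print_regs(self->run);			/* GPRS, ACRS and CRS from the sync regs */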
Signed-off-by: Christoph Schlameuss Acked-by: Janosch Frank Link: https://lore.kernel.org/r/20240807154512.316936-6-schlameuss@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-6-schlameuss@linux.ibm.com> --- .../selftests/kvm/include/s390x/debug_print.h | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tools/testing/selftests/kvm/include/s390x/debug_print.h diff --git a/tools/testing/selftests/kvm/include/s390x/debug_print.h b/tools/testing/selftests/kvm/include/s390x/debug_print.h new file mode 100644 index 0000000000000..1bf275631cc69 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390x/debug_print.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Definition for kernel virtual machines on s390x + * + * Copyright IBM Corp. 2024 + * + * Authors: + * Christoph Schlameuss + */ + +#ifndef SELFTEST_KVM_DEBUG_PRINT_H +#define SELFTEST_KVM_DEBUG_PRINT_H + +#include "asm/ptrace.h" +#include "kvm_util.h" +#include "sie.h" + +static inline void print_hex_bytes(const char *name, u64 addr, size_t len) +{ + u64 pos; + + pr_debug("%s (%p)\n", name, (void *)addr); + pr_debug(" 0/0x00---------|"); + if (len > 8) + pr_debug(" 8/0x08---------|"); + if (len > 16) + pr_debug(" 16/0x10--------|"); + if (len > 24) + pr_debug(" 24/0x18--------|"); + for (pos = 0; pos < len; pos += 8) { + if ((pos % 32) == 0) + pr_debug("\n %3lu 0x%.3lx ", pos, pos); + pr_debug(" %16lx", *((u64 *)(addr + pos))); + } + pr_debug("\n"); +} + +static inline void print_hex(const char *name, u64 addr) +{ + print_hex_bytes(name, addr, 512); +} + +static inline void print_psw(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) +{ + pr_debug("flags:0x%x psw:0x%.16llx:0x%.16llx exit:%u %s\n", + run->flags, + run->psw_mask, run->psw_addr, + run->exit_reason, exit_reason_str(run->exit_reason)); + pr_debug("sie_block psw:0x%.16llx:0x%.16llx\n", + sie_block->psw_mask, sie_block->psw_addr); +} + +static inline void print_run(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) +{ + print_hex_bytes("run", (u64)run, 0x150); + print_hex("sie_block", (u64)sie_block); + print_psw(run, sie_block); +} + +static inline void print_regs(struct kvm_run *run) +{ + struct kvm_sync_regs *sync_regs = &run->s.regs; + + print_hex_bytes("GPRS", (u64)sync_regs->gprs, 8 * NUM_GPRS); + print_hex_bytes("ACRS", (u64)sync_regs->acrs, 4 * NUM_ACRS); + print_hex_bytes("CRS", (u64)sync_regs->crs, 8 * NUM_CRS); +} + +#endif /* SELFTEST_KVM_DEBUG_PRINT_H */ From c7ff693fa2094ba0a9d0a20feb4ab1658eff9c33 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Mon, 22 Jul 2024 11:06:21 +0200 Subject: [PATCH 018/352] module: Split modules_install compression and in-kernel decompression The kernel configuration allows specifying a module compression mode. If one is selected then each module gets compressed during 'make modules_install' and additionally one can also enable support for a respective direct in-kernel decompression support. This means that the decompression support cannot be enabled without the automatic compression. Some distributions, such as the (open)SUSE family, use a signer service for modules. A build runs on a worker machine but signing is done by a separate locked-down server that is in possession of the signing key. The build invokes 'make modules_install' to create a modules tree, collects information about the modules, asks the signer service for their signature, appends each signature to the respective module and compresses all modules. 
When using this arrangement, the 'make modules_install' step produces unsigned+uncompressed modules and the distribution's own build recipe takes care of signing and compression later. The signing support can be currently enabled without automatically signing modules during 'make modules_install'. However, the in-kernel decompression support can be selected only after first enabling automatic compression during this step. To allow only enabling the in-kernel decompression support without the automatic compression during 'make modules_install', separate the compression options similarly to the signing options, as follows: > Enable loadable module support [*] Module compression Module compression type (GZIP) ---> [*] Automatically compress all modules [ ] Support in-kernel module decompression * "Module compression" (MODULE_COMPRESS) is a new main switch for the compression/decompression support. It replaces MODULE_COMPRESS_NONE. * "Module compression type" (MODULE_COMPRESS_) chooses the compression type, one of GZ, XZ, ZSTD. * "Automatically compress all modules" (MODULE_COMPRESS_ALL) is a new option to enable module compression during 'make modules_install'. It defaults to Y. * "Support in-kernel module decompression" (MODULE_DECOMPRESS) enables in-kernel decompression. Signed-off-by: Petr Pavlu Acked-by: Masahiro Yamada Signed-off-by: Luis Chamberlain --- kernel/module/Kconfig | 61 ++++++++++++++++++++-------------------- scripts/Makefile.modinst | 2 ++ 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 4047b6d482557..bb7f7930fef6f 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -278,64 +278,65 @@ config MODULE_SIG_HASH default "sha3-384" if MODULE_SIG_SHA3_384 default "sha3-512" if MODULE_SIG_SHA3_512 -choice - prompt "Module compression mode" +config MODULE_COMPRESS + bool "Module compression" help - This option allows you to choose the algorithm which will be used to - compress modules when 'make modules_install' is run. (or, you can - choose to not compress modules at all.) - - External modules will also be compressed in the same way during the - installation. - - For modules inside an initrd or initramfs, it's more efficient to - compress the whole initrd or initramfs instead. - + Enable module compression to reduce on-disk size of module binaries. This is fully compatible with signed modules. - Please note that the tool used to load modules needs to support the - corresponding algorithm. module-init-tools MAY support gzip, and kmod - MAY support gzip, xz and zstd. + The tool used to work with modules needs to support the selected + compression type. kmod MAY support gzip, xz and zstd. Other tools + might have a limited selection of the supported types. - Your build system needs to provide the appropriate compression tool - to compress the modules. + Note that for modules inside an initrd or initramfs, it's more + efficient to compress the whole ramdisk instead. - If in doubt, select 'None'. + If unsure, say N. -config MODULE_COMPRESS_NONE - bool "None" +choice + prompt "Module compression type" + depends on MODULE_COMPRESS help - Do not compress modules. The installed modules are suffixed - with .ko. + Choose the supported algorithm for module compression. config MODULE_COMPRESS_GZIP bool "GZIP" help - Compress modules with GZIP. The installed modules are suffixed - with .ko.gz. + Support modules compressed with GZIP. The installed modules are + suffixed with .ko.gz.
config MODULE_COMPRESS_XZ bool "XZ" help - Compress modules with XZ. The installed modules are suffixed - with .ko.xz. + Support modules compressed with XZ. The installed modules are + suffixed with .ko.xz. config MODULE_COMPRESS_ZSTD bool "ZSTD" help - Compress modules with ZSTD. The installed modules are suffixed - with .ko.zst. + Support modules compressed with ZSTD. The installed modules are + suffixed with .ko.zst. endchoice +config MODULE_COMPRESS_ALL + bool "Automatically compress all modules" + default y + depends on MODULE_COMPRESS + help + Compress all modules during 'make modules_install'. + + Your build system needs to provide the appropriate compression tool + for the selected compression type. External modules will also be + compressed in the same way during the installation. + config MODULE_DECOMPRESS bool "Support in-kernel module decompression" - depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD + depends on MODULE_COMPRESS select ZLIB_INFLATE if MODULE_COMPRESS_GZIP select XZ_DEC if MODULE_COMPRESS_XZ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD help - Support for decompressing kernel modules by the kernel itself instead of relying on userspace to perform this task. Useful when load pinning security policy is enabled. diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst index 0afd75472679f..bce4a9adb8938 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -51,9 +51,11 @@ $(foreach x, % :, $(if $(findstring $x, $(dst)), \ $(error module installation path cannot contain '$x'))) suffix-y := +ifdef CONFIG_MODULE_COMPRESS_ALL suffix-$(CONFIG_MODULE_COMPRESS_GZIP) := .gz suffix-$(CONFIG_MODULE_COMPRESS_XZ) := .xz suffix-$(CONFIG_MODULE_COMPRESS_ZSTD) := .zst +endif modules := $(patsubst $(extmod_prefix)%.o, $(dst)/%.ko$(suffix-y), $(modules)) install-$(CONFIG_MODULES) += $(modules) From f94ce04e54038c37bcac8ae2e4e99a81a188b777 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Mon, 22 Jul 2024 11:06:22 +0200 Subject: [PATCH 019/352] module: Clean up the description of MODULE_SIG_ The MODULE_SIG_ config choice has an inconsistent prompt styled as a question and lengthy option names. Simplify the prompt and option names to be consistent with other module options. Signed-off-by: Petr Pavlu Signed-off-by: Luis Chamberlain --- kernel/module/Kconfig | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index bb7f7930fef6f..ccdbd1bc12aa7 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -228,7 +228,7 @@ comment "Do not forget to sign required modules with scripts/sign-file" depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL choice - prompt "Which hash algorithm should modules be signed with?" + prompt "Hash algorithm to sign modules" depends on MODULE_SIG || IMA_APPRAISE_MODSIG help This determines which sort of hashing algorithm will be used during @@ -238,31 +238,31 @@ choice the signature on that module. 
config MODULE_SIG_SHA1 - bool "Sign modules with SHA-1" + bool "SHA-1" select CRYPTO_SHA1 config MODULE_SIG_SHA256 - bool "Sign modules with SHA-256" + bool "SHA-256" select CRYPTO_SHA256 config MODULE_SIG_SHA384 - bool "Sign modules with SHA-384" + bool "SHA-384" select CRYPTO_SHA512 config MODULE_SIG_SHA512 - bool "Sign modules with SHA-512" + bool "SHA-512" select CRYPTO_SHA512 config MODULE_SIG_SHA3_256 - bool "Sign modules with SHA3-256" + bool "SHA3-256" select CRYPTO_SHA3 config MODULE_SIG_SHA3_384 - bool "Sign modules with SHA3-384" + bool "SHA3-384" select CRYPTO_SHA3 config MODULE_SIG_SHA3_512 - bool "Sign modules with SHA3-512" + bool "SHA3-512" select CRYPTO_SHA3 endchoice From 1448d4a935abb89942d0af7864480db349433079 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 14 Aug 2024 22:33:46 +0200 Subject: [PATCH 020/352] KVM: x86: Optimize local variable in start_sw_tscdeadline() Change the data type of the local variable this_tsc_khz to u32 because virtual_tsc_khz is also declared as u32. Since do_div() casts the divisor to u32 anyway, changing the data type of this_tsc_khz to u32 also removes the following Coccinelle/coccicheck warning reported by do_div.cocci: WARNING: do_div() does a 64-by-32 division, please consider using div64_ul instead Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20240814203345.2234-2-thorsten.blum@toblux.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5bb481aefcbcd..d77cd3b87e850 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1944,7 +1944,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) u64 ns = 0; ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; + u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; ktime_t now; From 1c450ffef589383f743d798666703687fc2e582b Mon Sep 17 00:00:00 2001 From: Tao Su Date: Mon, 19 Aug 2024 14:23:27 +0800 Subject: [PATCH 021/352] KVM: x86: Advertise AVX10.1 CPUID to userspace Advertise AVX10.1 related CPUIDs, i.e. report AVX10 support bit via CPUID.(EAX=07H, ECX=01H):EDX[bit 19] and new CPUID leaf 0x24H so that guest OS and applications can query the AVX10.1 CPUIDs directly. Intel AVX10 represents the first major new vector ISA since the introduction of Intel AVX512, which will establish a common, converged vector instruction set across all Intel architectures[1]. AVX10.1 is an early version of AVX10, that enumerates the Intel AVX512 instruction set at 128, 256, and 512 bits which is enabled on Granite Rapids. I.e., AVX10.1 is only a new CPUID enumeration with no new functionality. New features, e.g. Embedded Rounding and Suppress All Exceptions (SAE) will be introduced in AVX10.2. Advertising AVX10.1 is safe because there is nothing to enable for AVX10.1, i.e. it's purely a new way to enumerate support, thus there will never be anything for the kernel to enable. Note just the CPUID checking is changed when using AVX512 related instructions, e.g. if using one AVX512 instruction needs to check (AVX512 AND AVX512DQ), it can check ((AVX512 AND AVX512DQ) OR AVX10.1) after checking XCR0[7:5]. The versions of AVX10 are expected to be inclusive, e.g. version N+1 is a superset of version N. Per the spec, the version can never be 0, just advertise AVX10.1 if it's supported in hardware. 
Moreover, advertising AVX10_{128,256,512} needs to land in the same commit as advertising basic AVX10.1 support, otherwise KVM would advertise an impossible CPU model. E.g. a CPU with AVX512 but not AVX10.1/512 is impossible per the SDM. As more and more AVX related CPUIDs are added (it would have resulted in around 40-50 CPUID flags when developing AVX10), the versioning approach is introduced. But incrementing version numbers are bad for virtualization. E.g. if AVX10.2 has a feature that shouldn't be enumerated to guests for whatever reason, then KVM can't enumerate any "later" features either, because the only way to hide the problematic AVX10.2 feature is to set the version to AVX10.1 or lower[2]. But most AVX features are just passed through and don't have virtualization controls, so AVX10 should not be problematic in practice, so long as Intel honors their promise that future versions will be supersets of past versions. [1] https://cdrdv2.intel.com/v1/dl/getContent/784267 [2] https://lore.kernel.org/all/Zkz5Ak0PQlAN8DxK@google.com/ Suggested-by: Sean Christopherson Signed-off-by: Tao Su Link: https://lore.kernel.org/r/20240819062327.3269720-1-tao1.su@linux.intel.com [sean: minor changelog tweaks] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpuid.h | 1 + arch/x86/kvm/cpuid.c | 30 ++++++++++++++++++++++++++++-- arch/x86/kvm/reverse_cpuid.h | 8 ++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 6b122a31da065..aa21c105eef14 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -179,6 +179,7 @@ static __always_inline bool cpuid_function_is_indexed(u32 function) case 0x1d: case 0x1e: case 0x1f: + case 0x24: case 0x8000001d: return true; } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2617be544480a..41786b834b163 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -705,7 +705,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | - F(AMX_COMPLEX) + F(AMX_COMPLEX) | F(AVX10) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX, @@ -721,6 +721,10 @@ void kvm_set_cpu_caps(void) SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) ); + kvm_cpu_cap_init_kvm_defined(CPUID_24_0_EBX, + F(AVX10_128) | F(AVX10_256) | F(AVX10_512) + ); + kvm_cpu_cap_mask(CPUID_8000_0001_ECX, F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | @@ -949,7 +953,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) switch (function) { case 0: /* Limited to the highest leaf implemented in KVM. */ - entry->eax = min(entry->eax, 0x1fU); + entry->eax = min(entry->eax, 0x24U); break; case 1: cpuid_entry_override(entry, CPUID_1_EDX); @@ -1174,6 +1178,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } break; + case 0x24: { + u8 avx10_version; + + if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + /* + * The AVX10 version is encoded in EBX[7:0]. Note, the version + * is guaranteed to be >=1 if AVX10 is supported. Note #2, the + * version needs to be captured before overriding EBX features! 
+ */ + avx10_version = min_t(u8, entry->ebx & 0xff, 1); + cpuid_entry_override(entry, CPUID_24_0_EBX); + entry->ebx |= avx10_version; + + entry->eax = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } case KVM_CPUID_SIGNATURE: { const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 2f4e155080bad..0d17d6b706396 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -17,6 +17,7 @@ enum kvm_only_cpuid_leafs { CPUID_8000_0007_EDX, CPUID_8000_0022_EAX, CPUID_7_2_EDX, + CPUID_24_0_EBX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -46,6 +47,7 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) #define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) +#define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19) /* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */ #define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0) @@ -55,6 +57,11 @@ enum kvm_only_cpuid_leafs { #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) +/* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */ +#define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16) +#define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) +#define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) + /* CPUID level 0x80000007 (EDX). */ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) @@ -90,6 +97,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, + [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, }; /* From e7e80b66fb242a63cdfc3d3534cff62a5e293185 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Jun 2024 16:19:09 -0700 Subject: [PATCH 022/352] x86/cpu: KVM: Add common defines for architectural memory types (PAT, MTRRs, etc.) Add defines for the architectural memory types that can be shoved into various MSRs and registers, e.g. MTRRs, PAT, VMX capabilities MSRs, EPTPs, etc. While most MSRs/registers support only a subset of all memory types, the values themselves are architectural and identical across all users. Leave the goofy MTRR_TYPE_* definitions as-is since they are in a uapi header, but add compile-time assertions to connect the dots (and sanity check that the msr-index.h values didn't get fat-fingered). Keep the VMX_EPTP_MT_* defines so that it's slightly more obvious that the EPTP holds a single memory type in 3 of its 64 bits; those bits just happen to be 2:0, i.e. don't need to be shifted. Opportunistically use X86_MEMTYPE_WB instead of an open coded '6' in setup_vmcs_config(). No functional change intended. 
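As a rough sketch of how the shared values get used (illustrative, not taken from the diff), the same architectural encodings land in registers with very different layouts:

	/* EPT pointer: the memory type occupies bits 2:0, so the value is used as-is */
	u64 eptp = root_hpa | VMX_EPTP_PWL_4 | VMX_EPTP_MT_WB;	/* VMX_EPTP_MT_WB == X86_MEMTYPE_WB */

	/* PAT MSR: eight entries, one memory type per byte (remaining entries omitted) */
	u64 pat = X86_MEMTYPE_WB | (X86_MEMTYPE_WT << 8) | (X86_MEMTYPE_UC_MINUS << 16);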
Reviewed-by: Thomas Gleixner Acked-by: Kai Huang Reviewed-by: Xiaoyao Li Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20240605231918.2915961-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr-index.h | 15 ++++++++++++++- arch/x86/include/asm/vmx.h | 5 +++-- arch/x86/kernel/cpu/mtrr/mtrr.c | 6 ++++++ arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/mm/pat/memtype.c | 33 ++++++++++++-------------------- 6 files changed, 37 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 82c6a4d350e09..bb43a07616e64 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -36,6 +36,20 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) +/* + * Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc. + * Most MSRs support/allow only a subset of memory types, but the values + * themselves are common across all relevant MSRs. + */ +#define X86_MEMTYPE_UC 0ull /* Uncacheable, a.k.a. Strong Uncacheable */ +#define X86_MEMTYPE_WC 1ull /* Write Combining */ +/* RESERVED 2 */ +/* RESERVED 3 */ +#define X86_MEMTYPE_WT 4ull /* Write Through */ +#define X86_MEMTYPE_WP 5ull /* Write Protected */ +#define X86_MEMTYPE_WB 6ull /* Write Back */ +#define X86_MEMTYPE_UC_MINUS 7ull /* Weak Uncacheabled (PAT only) */ + /* FRED MSRs */ #define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ #define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ @@ -1163,7 +1177,6 @@ #define VMX_BASIC_64 0x0001000000000000LLU #define VMX_BASIC_MEM_TYPE_SHIFT 50 #define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU -#define VMX_BASIC_MEM_TYPE_WB 6LLU #define VMX_BASIC_INOUT 0x0040000000000000LLU /* Resctrl MSRs: */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d77a31039f241..e531d8d80a112 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -508,9 +508,10 @@ enum vmcs_field { #define VMX_EPTP_PWL_4 0x18ull #define VMX_EPTP_PWL_5 0x20ull #define VMX_EPTP_AD_ENABLE_BIT (1ull << 6) +/* The EPTP memtype is encoded in bits 2:0, i.e. doesn't need to be shifted. */ #define VMX_EPTP_MT_MASK 0x7ull -#define VMX_EPTP_MT_WB 0x6ull -#define VMX_EPTP_MT_UC 0x0ull +#define VMX_EPTP_MT_WB X86_MEMTYPE_WB +#define VMX_EPTP_MT_UC X86_MEMTYPE_UC #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 2a2fc14955cd3..989d368be04fc 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -55,6 +55,12 @@ #include "mtrr.h" +static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE); +static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB); +static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH); +static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT); +static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK); + /* arch_phys_wc_add returns an MTRR register index plus this offset. 
*/ #define MTRR_TO_PHYS_WC_OFFSET 1000 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2392a7ef254df..504fe5ffd47b7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7070,7 +7070,7 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + (X86_MEMTYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f18c2d8c7476e..a4d077db04cf6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2747,7 +2747,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, #endif /* Require Write-Back (WB) memory type for VMCS accesses. */ - if (((vmx_msr_high >> 18) & 15) != 6) + if (((vmx_msr_high >> 18) & 15) != X86_MEMTYPE_WB) return -EIO; rdmsrl(MSR_IA32_VMX_MISC, misc_msr); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index bdc2a240c2aae..15b888ebaf17e 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -176,15 +176,6 @@ static inline void set_page_memtype(struct page *pg, } #endif -enum { - PAT_UC = 0, /* uncached */ - PAT_WC = 1, /* Write combining */ - PAT_WT = 4, /* Write Through */ - PAT_WP = 5, /* Write Protected */ - PAT_WB = 6, /* Write Back (default) */ - PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ -}; - #define CM(c) (_PAGE_CACHE_MODE_ ## c) static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val, @@ -194,13 +185,13 @@ static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val, char *cache_mode; switch (pat_val) { - case PAT_UC: cache = CM(UC); cache_mode = "UC "; break; - case PAT_WC: cache = CM(WC); cache_mode = "WC "; break; - case PAT_WT: cache = CM(WT); cache_mode = "WT "; break; - case PAT_WP: cache = CM(WP); cache_mode = "WP "; break; - case PAT_WB: cache = CM(WB); cache_mode = "WB "; break; - case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; - default: cache = CM(WB); cache_mode = "WB "; break; + case X86_MEMTYPE_UC: cache = CM(UC); cache_mode = "UC "; break; + case X86_MEMTYPE_WC: cache = CM(WC); cache_mode = "WC "; break; + case X86_MEMTYPE_WT: cache = CM(WT); cache_mode = "WT "; break; + case X86_MEMTYPE_WP: cache = CM(WP); cache_mode = "WP "; break; + case X86_MEMTYPE_WB: cache = CM(WB); cache_mode = "WB "; break; + case X86_MEMTYPE_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; + default: cache = CM(WB); cache_mode = "WB "; break; } memcpy(msg, cache_mode, 4); @@ -257,11 +248,11 @@ void pat_cpu_init(void) void __init pat_bp_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; -#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \ - (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \ - ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \ - ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \ - ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56)) +#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \ + ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \ + (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \ + (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \ + (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56)) if (!IS_ENABLED(CONFIG_X86_PAT)) From beb2e446046f8dd96bbeed3267b5f26bf1926ef9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Jun 2024 16:19:10 -0700 Subject: [PATCH 023/352] x86/cpu: KVM: Move 
macro to encode PAT value to common header Move pat/memtype.c's PAT() macro to msr-index.h as PAT_VALUE(), and use it in KVM to define the default (Power-On / RESET) PAT value instead of open coding an inscrutable magic number. No functional change intended. Reviewed-by: Xiaoyao Li Reviewed-by: Kai Huang Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240605231918.2915961-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr-index.h | 6 ++++++ arch/x86/kvm/x86.h | 3 ++- arch/x86/mm/pat/memtype.c | 13 +++---------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index bb43a07616e64..f6eb3460e169d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -377,6 +377,12 @@ #define MSR_IA32_CR_PAT 0x00000277 +#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7) \ + ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \ + (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \ + (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \ + (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56)) + #define MSR_IA32_DEBUGCTLMSR 0x000001d9 #define MSR_IA32_LASTBRANCHFROMIP 0x000001db #define MSR_IA32_LASTBRANCHTOIP 0x000001dc diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 50596f6f83208..5585c2ef147ae 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -103,7 +103,8 @@ static inline unsigned int __shrink_ple_window(unsigned int val, return max(val, min); } -#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL +#define MSR_IA32_CR_PAT_DEFAULT \ + PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC) void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 15b888ebaf17e..6c4e29457c107 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -248,12 +248,6 @@ void pat_cpu_init(void) void __init pat_bp_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; -#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \ - ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \ - (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \ - (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \ - (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56)) - if (!IS_ENABLED(CONFIG_X86_PAT)) pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); @@ -284,7 +278,7 @@ void __init pat_bp_init(void) * NOTE: When WC or WP is used, it is redirected to UC- per * the default setup in __cachemode2pte_tbl[]. */ - pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); + pat_msr_val = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); } /* @@ -319,7 +313,7 @@ void __init pat_bp_init(void) * NOTE: When WT or WP is used, it is redirected to UC- per * the default setup in __cachemode2pte_tbl[]. */ - pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); + pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); } else { /* * Full PAT support. We put WT in slot 7 to improve @@ -347,13 +341,12 @@ void __init pat_bp_init(void) * The reserved slots are unused, but mapped to their * corresponding types in the presence of PAT errata. 
*/ - pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); + pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); } memory_caching_control |= CACHE_PAT; init_cache_modes(pat_msr_val); -#undef PAT } static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ From b6717d35d8597d19f44736f11963e3bd5b8881b4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Jun 2024 16:19:11 -0700 Subject: [PATCH 024/352] KVM: x86: Stuff vCPU's PAT with default value at RESET, not creation Move the stuffing of the vCPU's PAT to the architectural "default" value from kvm_arch_vcpu_create() to kvm_vcpu_reset(), guarded by !init_event, to better capture that the default value is the value "Following Power-up or Reset". E.g. setting PAT only during creation would break if KVM were to expose a RESET ioctl() to userspace (which is unlikely, but that's not a good reason to have unintuitive code). No functional change. Reviewed-by: Xiaoyao Li Reviewed-by: Kai Huang Reviewed-by: Jim Mattson Reviewed-by: Zhao Liu Link: https://lore.kernel.org/r/20240605231918.2915961-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70219e4069874..3ef6299ef8238 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12260,8 +12260,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); - vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - kvm_async_pf_hash_reset(vcpu); vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; @@ -12427,6 +12425,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (!init_event) { vcpu->arch.smbase = 0x30000; + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + vcpu->arch.msr_misc_features_enables = 0; vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; From d7bfc9ffd58037ff86f9fd0c3cef77cccb555da3 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 16:19:12 -0700 Subject: [PATCH 025/352] KVM: VMX: Move MSR_IA32_VMX_BASIC bit defines to asm/vmx.h Move the bit defines for MSR_IA32_VMX_BASIC from msr-index.h to vmx.h so that they are colocated with other VMX MSR bit defines, and with the helpers that extract specific information from an MSR_IA32_VMX_BASIC value. Opportunistically use BIT_ULL() instead of open coding hex values. Opportunistically rename VMX_BASIC_64 to VMX_BASIC_32BIT_PHYS_ADDR_ONLY, as "VMX_BASIC_64" is wildly misleading. The flag enumerates that addresses are limited to 32 bits, not that 64-bit addresses are allowed. Last but not least, opportunistically #define DUAL_MONITOR_TREATMENT so that all known single-bit feature flags are defined (this will allow replacing open-coded literals in the future).
Cc: Shan Kang Cc: Kai Huang Signed-off-by: Xin Li [sean: split to separate patch, write changelog] Reviewed-by: Zhao Liu Reviewed-by: Kai Huang Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240605231918.2915961-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr-index.h | 8 -------- arch/x86/include/asm/vmx.h | 7 +++++++ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f6eb3460e169d..ad0b579d1c01d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1177,14 +1177,6 @@ #define MSR_IA32_VMX_VMFUNC 0x00000491 #define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 -/* VMX_BASIC bits and bitmasks */ -#define VMX_BASIC_VMCS_SIZE_SHIFT 32 -#define VMX_BASIC_TRUE_CTLS (1ULL << 55) -#define VMX_BASIC_64 0x0001000000000000LLU -#define VMX_BASIC_MEM_TYPE_SHIFT 50 -#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU -#define VMX_BASIC_INOUT 0x0040000000000000LLU - /* Resctrl MSRs: */ /* - Intel: */ #define MSR_IA32_L3_QOS_CFG 0xc81 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index e531d8d80a112..81b986e501a93 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -135,6 +135,13 @@ #define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING) #define VMFUNC_EPTP_ENTRIES 512 +#define VMX_BASIC_VMCS_SIZE_SHIFT 32 +#define VMX_BASIC_32BIT_PHYS_ADDR_ONLY BIT_ULL(48) +#define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49) +#define VMX_BASIC_MEM_TYPE_SHIFT 50 +#define VMX_BASIC_INOUT BIT_ULL(54) +#define VMX_BASIC_TRUE_CTLS BIT_ULL(55) + static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) { return vmx_basic & GENMASK_ULL(30, 0); From 9df398ff7d2ab4fa2ecf6131044431dc94c5bdf6 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 16:19:13 -0700 Subject: [PATCH 026/352] KVM: VMX: Track CPU's MSR_IA32_VMX_BASIC as a single 64-bit value Track the "basic" capabilities VMX MSR as a single u64 in vmcs_config instead of splitting it across three fields, that obviously don't combine into a single 64-bit value, so that KVM can use the macros that define MSR bits using their absolute position. Replace all open coded shifts and masks, many of which are relative to the "high" half, with the appropriate macro. Opportunistically use VMX_BASIC_32BIT_PHYS_ADDR_ONLY instead of an open coded equivalent, and clean up the related comment to not reference a specific SDM section (to the surprise of no one, the comment is stale). No functional change intended (though obviously the code generation will be quite different). 
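To illustrate the difference (a sketch, not from the diff): with the MSR split into 32-bit halves, fields had to be extracted with masks relative to the high half, whereas the single u64 plus helpers use the architectural bit positions directly:

	u64 basic;

	rdmsrl(MSR_IA32_VMX_BASIC, basic);
	/* old style: rdmsr(MSR_IA32_VMX_BASIC, lo, hi); size = hi & 0x1fff; */
	size = vmx_basic_vmcs_size(basic);		/* bits 44:32 */
	revision = vmx_basic_vmcs_revision_id(basic);	/* bits 30:0 */
	memtype = vmx_basic_vmcs_mem_type(basic);	/* bits 53:50 */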
Cc: Shan Kang Cc: Kai Huang Signed-off-by: Xin Li [sean: split to separate patch, write changelog] Reviewed-by: Xiaoyao Li Reviewed-by: Kai Huang Reviewed-by: Zhao Liu Link: https://lore.kernel.org/r/20240605231918.2915961-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 5 +++++ arch/x86/kvm/vmx/capabilities.h | 6 ++---- arch/x86/kvm/vmx/vmx.c | 28 ++++++++++++++-------------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 81b986e501a93..90963b14afaad 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -152,6 +152,11 @@ static inline u32 vmx_basic_vmcs_size(u64 vmx_basic) return (vmx_basic & GENMASK_ULL(44, 32)) >> 32; } +static inline u32 vmx_basic_vmcs_mem_type(u64 vmx_basic) +{ + return (vmx_basic & GENMASK_ULL(53, 50)) >> 50; +} + static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) { return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 41a4533f99897..86ce8bb96bed7 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -54,9 +54,7 @@ struct nested_vmx_msrs { }; struct vmcs_config { - int size; - u32 basic_cap; - u32 revision_id; + u64 basic; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; @@ -76,7 +74,7 @@ extern struct vmx_capability vmx_capability __ro_after_init; static inline bool cpu_has_vmx_basic_inout(void) { - return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); + return vmcs_config.basic & VMX_BASIC_INOUT; } static inline bool cpu_has_virtual_nmis(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a4d077db04cf6..5cbd86c4386b0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2605,13 +2605,13 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) static int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { - u32 vmx_msr_low, vmx_msr_high; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + u64 basic_msr; u64 misc_msr; int i; @@ -2734,29 +2734,29 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, _vmexit_control &= ~x_ctrl; } - rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); + rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ - if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) + if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) return -EIO; #ifdef CONFIG_X86_64 - /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ - if (vmx_msr_high & (1u<<16)) + /* + * KVM expects to be able to shove all legal physical addresses into + * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always + * 0 for processors that support Intel 64 architecture". + */ + if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EIO; #endif /* Require Write-Back (WB) memory type for VMCS accesses. 
*/ - if (((vmx_msr_high >> 18) & 15) != X86_MEMTYPE_WB) + if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) return -EIO; rdmsrl(MSR_IA32_VMX_MISC, misc_msr); - vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; - - vmcs_conf->revision_id = vmx_msr_low; - + vmcs_conf->basic = basic_msr; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; @@ -2903,13 +2903,13 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) if (!pages) return NULL; vmcs = page_address(pages); - memset(vmcs, 0, vmcs_config.size); + memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); /* KVM supports Enlightened VMCS v1 only */ if (kvm_is_using_evmcs()) vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else - vmcs->hdr.revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); if (shadow) vmcs->hdr.shadow_vmcs = 1; @@ -3002,7 +3002,7 @@ static __init int alloc_kvm_area(void) * physical CPU. */ if (kvm_is_using_evmcs()) - vmcs->hdr.revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); per_cpu(vmxarea, cpu) = vmcs; } From c97b106fa8aa500823695abfda7c9bdefb42a648 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 16:19:14 -0700 Subject: [PATCH 027/352] KVM: nVMX: Use macros and #defines in vmx_restore_vmx_basic() Use macros in vmx_restore_vmx_basic() instead of open coding everything using BIT_ULL() and GENMASK_ULL(). Opportunistically split feature bits and reserved bits into separate variables, and add a comment explaining the subset logic (it's not immediately obvious that the set of feature bits is NOT the set of _supported_ feature bits). Cc: Shan Kang Cc: Kai Huang Signed-off-by: Xin Li [sean: split to separate patch, write changelog, drop #defines] Reviewed-by: Zhao Liu Reviewed-by: Xiaoyao Li Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20240605231918.2915961-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 504fe5ffd47b7..eba5e94a3c7c0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1251,21 +1251,32 @@ static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) { - const u64 feature_and_reserved = - /* feature (except bit 48; see below) */ - BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | - /* reserved */ - BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | + VMX_BASIC_INOUT | + VMX_BASIC_TRUE_CTLS; + + const u64 reserved_bits = GENMASK_ULL(63, 56) | + GENMASK_ULL(47, 45) | + BIT_ULL(31); + u64 vmx_basic = vmcs_config.nested.basic; - if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + BUILD_BUG_ON(feature_bits & reserved_bits); + + /* + * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has + * inverted polarity), the incoming value must not set feature bits or + * reserved bits that aren't allowed/supported by KVM. Fields, i.e. + * multi-bit values, are explicitly checked below. 
+ */ + if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) return -EINVAL; /* * KVM does not emulate a version of VMX that constrains physical * addresses of VMX structures (e.g. VMCS) to 32-bits. */ - if (data & BIT_ULL(48)) + if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EINVAL; if (vmx_basic_vmcs_revision_id(vmx_basic) != From 92e648042c237c3bba223ab7f7e1a76f41ab3ef7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Jun 2024 16:19:15 -0700 Subject: [PATCH 028/352] KVM: nVMX: Add a helper to encode VMCS info in MSR_IA32_VMX_BASIC Add a helper to encode the VMCS revision, size, and supported memory types in MSR_IA32_VMX_BASIC, i.e. when synthesizing KVM's supported BASIC MSR value, and delete the now unused VMCS size and memtype shift macros. For a variety of reasons, KVM has shifted (pun intended) to using helpers to *get* information from the VMX MSRs, as opposed to defined MASK and SHIFT macros for direct use. Provide a similar helper for the nested VMX code, which needs to *set* information, so that KVM isn't left with a mix of SHIFT macros and dedicated helpers. Reported-by: Xiaoyao Li Reviewed-by: Xiaoyao Li Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20240605231918.2915961-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 7 +++++-- arch/x86/kvm/vmx/nested.c | 8 +++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 90963b14afaad..65aaf05772654 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -135,10 +135,8 @@ #define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING) #define VMFUNC_EPTP_ENTRIES 512 -#define VMX_BASIC_VMCS_SIZE_SHIFT 32 #define VMX_BASIC_32BIT_PHYS_ADDR_ONLY BIT_ULL(48) #define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49) -#define VMX_BASIC_MEM_TYPE_SHIFT 50 #define VMX_BASIC_INOUT BIT_ULL(54) #define VMX_BASIC_TRUE_CTLS BIT_ULL(55) @@ -157,6 +155,11 @@ static inline u32 vmx_basic_vmcs_mem_type(u64 vmx_basic) return (vmx_basic & GENMASK_ULL(53, 50)) >> 50; } +static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype) +{ + return revision | ((u64)size << 32) | ((u64)memtype << 50); +} + static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) { return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index eba5e94a3c7c0..335f1a6504977 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7077,12 +7077,10 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ - msrs->basic = - VMCS12_REVISION | - VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (X86_MEMTYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, + X86_MEMTYPE_WB); + msrs->basic |= VMX_BASIC_TRUE_CTLS; if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; } From dc1e67f70f6d4e336da06cb61bf6669d6fa39869 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Jun 2024 16:19:16 -0700 Subject: [PATCH 029/352] KVM VMX: Move MSR_IA32_VMX_MISC bit defines to asm/vmx.h Move the handful of MSR_IA32_VMX_MISC bit defines that are currently in msr-indx.h to vmx.h so that all of the VMX_MISC defines and wrappers can be found in a single location. 
Opportunistically use BIT_ULL() instead of open coding hex values, add defines for feature bits that are architecturally defined, and move the defines down in the file so that they are colocated with the helpers for getting fields from VMX_MISC. No functional change intended. Cc: Shan Kang Cc: Kai Huang Signed-off-by: Xin Li [sean: split to separate patch, write changelog] Reviewed-by: Zhao Liu Reviewed-by: Kai Huang Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240605231918.2915961-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr-index.h | 5 ----- arch/x86/include/asm/vmx.h | 19 ++++++++++++------- arch/x86/kvm/vmx/capabilities.h | 4 ++-- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/nested.h | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ad0b579d1c01d..72c2c0ecb62c8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1194,11 +1194,6 @@ #define MSR_IA32_SMBA_BW_BASE 0xc0000280 #define MSR_IA32_EVT_CFG_BASE 0xc0000400 -/* MSR_IA32_VMX_MISC bits */ -#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) -#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) -#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F - /* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 #define MSR_VM_IGNNE 0xc0010115 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 65aaf05772654..400819ccb42c3 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -122,13 +122,6 @@ #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff -#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f -#define VMX_MISC_SAVE_EFER_LMA 0x00000020 -#define VMX_MISC_ACTIVITY_HLT 0x00000040 -#define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100 -#define VMX_MISC_ZERO_LEN_INS 0x40000000 -#define VMX_MISC_MSR_LIST_MULTIPLIER 512 - /* VMFUNC functions */ #define VMFUNC_CONTROL_BIT(x) BIT((VMX_FEATURE_##x & 0x1f) - 28) @@ -160,6 +153,18 @@ static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype) return revision | ((u64)size << 32) | ((u64)memtype << 50); } +#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK GENMASK_ULL(4, 0) +#define VMX_MISC_SAVE_EFER_LMA BIT_ULL(5) +#define VMX_MISC_ACTIVITY_HLT BIT_ULL(6) +#define VMX_MISC_ACTIVITY_SHUTDOWN BIT_ULL(7) +#define VMX_MISC_ACTIVITY_WAIT_SIPI BIT_ULL(8) +#define VMX_MISC_INTEL_PT BIT_ULL(14) +#define VMX_MISC_RDMSR_IN_SMM BIT_ULL(15) +#define VMX_MISC_VMXOFF_BLOCK_SMI BIT_ULL(28) +#define VMX_MISC_VMWRITE_SHADOW_RO_FIELDS BIT_ULL(29) +#define VMX_MISC_ZERO_LEN_INS BIT_ULL(30) +#define VMX_MISC_MSR_LIST_MULTIPLIER 512 + static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) { return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 86ce8bb96bed7..cb6588238f466 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -223,7 +223,7 @@ static inline bool cpu_has_vmx_vmfunc(void) static inline bool cpu_has_vmx_shadow_vmcs(void) { /* check if the cpu supports writing r/o exit information fields */ - if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) + if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) return false; return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -365,7 +365,7 @@ static inline bool cpu_has_vmx_invvpid_global(void) static inline bool cpu_has_vmx_intel_pt(void) { - return (vmcs_config.misc & 
MSR_IA32_VMX_MISC_INTEL_PT) && + return (vmcs_config.misc & VMX_MISC_INTEL_PT) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 335f1a6504977..72db3718d8b21 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7062,7 +7062,7 @@ static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, { msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; msrs->misc_low |= - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT | VMX_MISC_ACTIVITY_WAIT_SIPI; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index cce4e2aa30fbf..0782fe599757a 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -109,7 +109,7 @@ static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.msrs.misc_low & - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; } static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) From 8f56b14e9fa0f91daf4d3be05988abf336255dc6 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 16:19:17 -0700 Subject: [PATCH 030/352] KVM: VMX: Open code VMX preemption timer rate mask in its accessor Use vmx_misc_preemption_timer_rate() to get the rate in hardware_setup(), and open code the rate's bitmask in vmx_misc_preemption_timer_rate() so that the function looks like all the helpers that grab values from VMX_BASIC and VMX_MISC MSR values. No functional change intended. 
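For reference, a sketch of how the rate is consumed (it mirrors the hardware_setup() usage; per the SDM, bits 4:0 of VMX_MISC give the power-of-two divider between the TSC and the preemption timer):

	rate = vmx_misc_preemption_timer_rate(vmcs_config.misc);
	/* the preemption timer ticks once every 2^rate TSC cycles */
	use_timer_freq = (u64)tsc_khz * 1000;
	use_timer_freq >>= rate;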
Cc: Shan Kang Cc: Kai Huang Signed-off-by: Xin Li [sean: split to separate patch, write changelog] Reviewed-by: Kai Huang Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240605231918.2915961-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 3 +-- arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 400819ccb42c3..f7fd4369b8218 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -153,7 +153,6 @@ static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype) return revision | ((u64)size << 32) | ((u64)memtype << 50); } -#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK GENMASK_ULL(4, 0) #define VMX_MISC_SAVE_EFER_LMA BIT_ULL(5) #define VMX_MISC_ACTIVITY_HLT BIT_ULL(6) #define VMX_MISC_ACTIVITY_SHUTDOWN BIT_ULL(7) @@ -167,7 +166,7 @@ static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype) static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) { - return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + return vmx_misc & GENMASK_ULL(4, 0); } static inline int vmx_misc_cr3_count(u64 vmx_misc) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5cbd86c4386b0..39a26ecf87ea9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8517,7 +8517,7 @@ __init int vmx_hardware_setup(void) u64 use_timer_freq = 5000ULL * 1000 * 1000; cpu_preemption_timer_multi = - vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + vmx_misc_preemption_timer_rate(vmcs_config.misc); if (tsc_khz) use_timer_freq = (u64)tsc_khz * 1000; From 566975f6ecd85247bd8989884d7b909d5a456da1 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 16:19:18 -0700 Subject: [PATCH 031/352] KVM: nVMX: Use macros and #defines in vmx_restore_vmx_misc() Use macros in vmx_restore_vmx_misc() instead of open coding everything using BIT_ULL() and GENMASK_ULL(). Opportunistically split feature bits and reserved bits into separate variables, and add a comment explaining the subset logic (it's not immediately obvious that the set of feature bits is NOT the set of _supported_ feature bits). 
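A sketch of the subset rule being enforced (illustrative, semantically equivalent to the kernel's is_bitwise_subset(): userspace may clear feature bits that KVM reports, but may not set feature or reserved bits that KVM does not report):

	static bool is_subset_of(u64 kvm_value, u64 user_value, u64 checked_bits)
	{
		/* no checked bit may be set by userspace unless KVM also sets it */
		return !(user_value & checked_bits & ~kvm_value);
	}

	if (!is_subset_of(vmx_misc, data, feature_bits | reserved_bits))
		return -EINVAL;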
Cc: Shan Kang Cc: Kai Huang Signed-off-by: Xin Li [sean: split to separate patch, write changelog, drop #defines] Reviewed-by: Xiaoyao Li Reviewed-by: Kai Huang Reviewed-by: Zhao Liu Link: https://lore.kernel.org/r/20240605231918.2915961-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 72db3718d8b21..97ecc2722c8fd 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1345,16 +1345,29 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) { - const u64 feature_and_reserved_bits = - /* feature */ - BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | - BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | - /* reserved */ - GENMASK_ULL(13, 9) | BIT_ULL(31); + const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | + VMX_MISC_ACTIVITY_HLT | + VMX_MISC_ACTIVITY_SHUTDOWN | + VMX_MISC_ACTIVITY_WAIT_SIPI | + VMX_MISC_INTEL_PT | + VMX_MISC_RDMSR_IN_SMM | + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_VMXOFF_BLOCK_SMI | + VMX_MISC_ZERO_LEN_INS; + + const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); + u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, vmcs_config.nested.misc_high); - if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + BUILD_BUG_ON(feature_bits & reserved_bits); + + /* + * The incoming value must not set feature bits or reserved bits that + * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are + * explicitly checked below. + */ + if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) return -EINVAL; if ((vmx->nested.msrs.pinbased_ctls_high & From c501062bb22ba325b7b77c91433d79574b4a3dcc Mon Sep 17 00:00:00 2001 From: Yongqiang Liu Date: Wed, 21 Aug 2024 19:27:37 +0800 Subject: [PATCH 032/352] KVM: SVM: Remove unnecessary GFP_KERNEL_ACCOUNT in svm_set_nested_state() The fixed size temporary variables vmcb_control_area and vmcb_save_area allocated in svm_set_nested_state() are released when the function exits. Meanwhile, svm_set_nested_state() also have vcpu mutex held to avoid massive concurrency allocation, so we don't need to set GFP_KERNEL_ACCOUNT. Signed-off-by: Yongqiang Liu Link: https://lore.kernel.org/r/20240821112737.3649937-1-liuyongqiang13@huawei.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 6f704c1037e51..d5314cb7dff4c 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1693,8 +1693,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; ret = -ENOMEM; - ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT); - save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT); + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + save = kzalloc(sizeof(*save), GFP_KERNEL); if (!ctl || !save) goto out_free; From caf22c6dd31294d136e527d0548f8697c7e72f37 Mon Sep 17 00:00:00 2001 From: Qiang Liu Date: Tue, 2 Jul 2024 06:46:09 +0000 Subject: [PATCH 033/352] KVM: VMX: Modify the BUILD_BUG_ON_MSG of the 32-bit field in the vmcs_check16 function According to the SDM, the meaning of field bit 0 is: Access type (0 = full; 1 = high); must be full for 16-bit, 32-bit, and natural-width fields. So there is no 32-bit high field here, it should be a 32-bit field instead. 
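For reference, the constants in the vmcs_check*() assertions decode the VMCS field encoding defined by the SDM: bit 0 is the access type (0 = full, 1 = high, only meaningful for 64-bit fields) and bits 14:13 are the field width. A sketch of that decoding (macro names below are illustrative, not taken from the source):

	/* Sketch of the VMCS field encoding bits the vmcs_check*() masks test. */
	#define VMCS_FIELD_ACCESS_HIGH	BIT(0)		/* high half of a 64-bit field */
	#define VMCS_FIELD_WIDTH	GENMASK(14, 13)	/* 0=16-bit, 1=64-bit, 2=32-bit, 3=natural */

	/* e.g. (field & 0x6000) == 0x4000 means width == 2, i.e. a 32-bit field,
	 * which is why the message below must not say "32-bit high field". */
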
Signed-off-by: Qiang Liu Link: https://lore.kernel.org/r/20240702064609.52487-1-liuq131@chinatelecom.cn Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 8060e5fc6dbd8..93e020dc88f65 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -47,7 +47,7 @@ static __always_inline void vmcs_check16(unsigned long field) BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, "16-bit accessor invalid for 64-bit high field"); BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "16-bit accessor invalid for 32-bit high field"); + "16-bit accessor invalid for 32-bit field"); BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, "16-bit accessor invalid for natural width field"); } From d9aa56edad3536f8b24c4695d51725e8d33b3c46 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 15 Jul 2024 22:12:24 +1200 Subject: [PATCH 034/352] KVM: VMX: Do not account for temporary memory allocation in ECREATE emulation In handle_encls_ecreate(), a page is allocated to store a copy of SECS structure used by the ENCLS[ECREATE] leaf from the guest. This page is only used temporarily and is freed after use in handle_encls_ecreate(). Don't account for the memory allocation of this page per [1]. Link: https://lore.kernel.org/kvm/b999afeb588eb75d990891855bc6d58861968f23.camel@intel.com/T/#mb81987afc3ab308bbb5861681aa9a20f2aece7fd [1] Signed-off-by: Kai Huang Link: https://lore.kernel.org/r/20240715101224.90958-1-kai.huang@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/sgx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 6fef01e0536e5..a3c3d2a51f47d 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -274,7 +274,7 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu) * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to * enforce restriction of access to the PROVISIONKEY. */ - contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT); + contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL); if (!contents) return -ENOMEM; From 653ea4489e6989f14a87abf8653f77c089097326 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Jul 2024 16:59:22 -0700 Subject: [PATCH 035/352] KVM: nVMX: Honor userspace MSR filter lists for nested VM-Enter/VM-Exit Synthesize a consistency check VM-Exit (VM-Enter) or VM-Abort (VM-Exit) if L1 attempts to load/store an MSR via the VMCS MSR lists that userspace has disallowed access to via an MSR filter. Intel already disallows including a handful of "special" MSRs in the VMCS lists, so denying access isn't completely without precedent. More importantly, the behavior is well-defined _and_ can be communicated the end user, e.g. to the customer that owns a VM running as L1 on top of KVM. On the other hand, ignoring userspace MSR filters is all but guaranteed to result in unexpected behavior as the access will hit KVM's internal state, which is likely not up-to-date. Unlike KVM-internal accesses, instruction emulation, and dedicated VMCS fields, the MSRs in the VMCS load/store lists are 100% guest controlled, thus making it all but impossible to reason about the correctness of ignoring the MSR filter. 
And if userspace *really* wants to deny access to MSRs via the aforementioned scenarios, userspace can hide the associated feature from the guest, e.g. by disabling the PMU to prevent accessing PERF_GLOBAL_CTRL via its VMCS field. But for the MSR lists, KVM is blindly processing MSRs; the MSR filters are the _only_ way for userspace to deny access. This partially reverts commit ac8d6cad3c7b ("KVM: x86: Only do MSR filtering when access MSR by rdmsr/wrmsr"). Cc: Hou Wenlong Cc: Jim Mattson Link: https://lore.kernel.org/r/20240722235922.3351122-1-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 23 +++++++++++++++++++---- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/vmx/nested.c | 12 ++++++------ arch/x86/kvm/x86.c | 6 ++++-- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index b3be87489108e..a4b7dc4a9ddaa 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4214,7 +4214,9 @@ whether or not KVM_CAP_X86_USER_SPACE_MSR's KVM_MSR_EXIT_REASON_FILTER is enabled. If KVM_MSR_EXIT_REASON_FILTER is enabled, KVM will exit to userspace on denied accesses, i.e. userspace effectively intercepts the MSR access. If KVM_MSR_EXIT_REASON_FILTER is not enabled, KVM will inject a #GP into the guest -on denied accesses. +on denied accesses. Note, if an MSR access is denied during emulation of MSR +load/stores during VMX transitions, KVM ignores KVM_MSR_EXIT_REASON_FILTER. +See the below warning for full details. If an MSR access is allowed by userspace, KVM will emulate and/or virtualize the access in accordance with the vCPU model. Note, KVM may still ultimately @@ -4229,9 +4231,22 @@ filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes an error. .. warning:: - MSR accesses as part of nested VM-Enter/VM-Exit are not filtered. - This includes both writes to individual VMCS fields and reads/writes - through the MSR lists pointed to by the VMCS. + MSR accesses that are side effects of instruction execution (emulated or + native) are not filtered as hardware does not honor MSR bitmaps outside of + RDMSR and WRMSR, and KVM mimics that behavior when emulating instructions + to avoid pointless divergence from hardware. E.g. RDPID reads MSR_TSC_AUX, + SYSENTER reads the SYSENTER MSRs, etc. + + MSRs that are loaded/stored via dedicated VMCS fields are not filtered as + part of VM-Enter/VM-Exit emulation. + + MSRs that are loaded/store via VMX's load/store lists _are_ filtered as part + of VM-Enter/VM-Exit emulation. If an MSR access is denied on VM-Enter, KVM + synthesizes a consistency check VM-Exit(EXIT_REASON_MSR_LOAD_FAIL). If an + MSR access is denied on VM-Exit, KVM synthesizes a VM-Abort. In short, KVM + extends Intel's architectural list of MSRs that cannot be loaded/saved via + the VM-Enter/VM-Exit MSR list. It is platform owner's responsibility to + to communicate any such restrictions to their end users. x2APIC MSR accesses cannot be filtered (KVM silently ignores filters that cover any x2APIC MSRs). 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a68cb3eba78f..4a93ac1b9be91 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2060,6 +2060,8 @@ void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); +int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data); int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2392a7ef254df..674f7089cc449 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -981,7 +981,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); goto fail; } - if (kvm_set_msr(vcpu, e.index, e.value)) { + if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); @@ -1017,7 +1017,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, } } - if (kvm_get_msr(vcpu, msr_index, data)) { + if (kvm_get_msr_with_filter(vcpu, msr_index, data)) { pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, msr_index); return false; @@ -1112,9 +1112,9 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, /* * Emulated VMEntry does not fail here. Instead a less * accurate value will be returned by - * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() - * instead of reading the value from the vmcs02 VMExit - * MSR-store area. + * nested_vmx_get_vmexit_msr_value() by reading KVM's + * internal MSR state instead of reading the value from + * the vmcs02 VMExit MSR-store area. */ pr_warn_ratelimited( "Not enough msr entries in msr_autostore. 
Can't add msr %x\n", @@ -4806,7 +4806,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) goto vmabort; } - if (kvm_set_msr(vcpu, h.index, h.value)) { + if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70219e4069874..34b52b49f5e68 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1940,19 +1940,21 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, return ret; } -static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) { if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) return KVM_MSR_RET_FILTERED; return kvm_get_msr_ignored_check(vcpu, index, data, false); } +EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter); -static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) +int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) { if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) return KVM_MSR_RET_FILTERED; return kvm_set_msr_ignored_check(vcpu, index, data, false); } +EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { From 41ab0d59faa9532bbd37c91b03a8e9fb0215d67c Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 25 Jul 2024 13:52:31 -0400 Subject: [PATCH 036/352] KVM: nVMX: Use vmx_segment_cache_clear() instead of open coded equivalent In prepare_vmcs02_rare(), call vmx_segment_cache_clear() instead of setting segment_cache.bitmask directly. Using the helper minimizes the chances of prepare_vmcs02_rare() doing the wrong thing in the future, e.g. if KVM ends up doing more than just zero the bitmask when purging the cache. No functional change intended. 
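Concretely, the future-proofing argument is that any extra invalidation work added to the purge path lands in one place for all callers; a sketch of the helper as added by this patch, with that point spelled out:

	/* Sketch of the helper this patch switches to (see the vmx.h hunk below):
	 * if purging the cache ever needs to do more than zero the bitmask,
	 * only this function changes, not every open-coded call site. */
	static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
	{
		vmx->segment_cache.bitmask = 0;
	}
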
Signed-off-by: Maxim Levitsky Link: https://lore.kernel.org/r/20240725175232.337266-2-mlevitsk@redhat.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 3 ++- arch/x86/kvm/vmx/vmx.c | 4 ---- arch/x86/kvm/vmx/vmx.h | 5 +++++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 674f7089cc449..867de342df33d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2470,6 +2470,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); @@ -2507,7 +2508,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - vmx->segment_cache.bitmask = 0; + vmx_segment_cache_clear(vmx); } if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f18c2d8c7476e..594db9afbc0fb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -525,10 +525,6 @@ static const struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; -static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) -{ - vmx->segment_cache.bitmask = 0; -} static unsigned long host_idt_base; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 42498fa63abbe..11b1b70faef29 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -756,4 +756,9 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && enable_ipiv; } +static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + #endif /* __KVM_X86_VMX_H */ From 2ab637df5f68d4e0cfa9becd10f43400aee785b3 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 16 Aug 2024 15:01:24 +0200 Subject: [PATCH 037/352] KVM: VMX: hyper-v: Prevent impossible NULL pointer dereference in evmcs_load() GCC 12.3.0 complains about a potential NULL pointer dereference in evmcs_load() as hv_get_vp_assist_page() can return NULL. In fact, this cannot happen because KVM verifies (hv_init_evmcs()) that every CPU has a valid VP assist page and aborts enabling the feature otherwise. CPU onlining path is also checked in vmx_hardware_enable(). To make the compiler happy and to future proof the code, add a KVM_BUG_ON() sentinel. It doesn't seem to be possible (and logical) to observe evmcs_load() happening without an active vCPU so it is presumed that kvm_get_running_vcpu() can't return NULL. No functional change intended. 
Reported-by: Mirsad Todorovac Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240816130124.286226-1-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx_onhyperv.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h index eb48153bfd73c..bba24ed99ee6c 100644 --- a/arch/x86/kvm/vmx/vmx_onhyperv.h +++ b/arch/x86/kvm/vmx/vmx_onhyperv.h @@ -104,6 +104,14 @@ static inline void evmcs_load(u64 phys_addr) struct hv_vp_assist_page *vp_ap = hv_get_vp_assist_page(smp_processor_id()); + /* + * When enabling eVMCS, KVM verifies that every CPU has a valid hv_vp_assist_page() + * and aborts enabling the feature otherwise. CPU onlining path is also checked in + * vmx_hardware_enable(). + */ + if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm)) + return; + if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) vp_ap->nested_control.features.directhypercall = 1; vp_ap->current_nested_vmcs = phys_addr; From e0183a42e3bcd4c30eb95bb046c016023fdc01ce Mon Sep 17 00:00:00 2001 From: Li Chen Date: Mon, 19 Aug 2024 13:59:27 +0800 Subject: [PATCH 038/352] KVM: x86: Use this_cpu_ptr() in kvm_user_return_msr_cpu_online Use this_cpu_ptr() instead of open coding the equivalent in kvm_user_return_msr_cpu_online. Signed-off-by: Li Chen Link: https://lore.kernel.org/r/87zfp96ojk.wl-me@linux.beauty Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70219e4069874..ffdf251bfef5b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -413,8 +413,7 @@ EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); static void kvm_user_return_msr_cpu_online(void) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); u64 value; int i; From acf2923271ef3611ceab7004fc8798d3ba31b739 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:20:06 -0700 Subject: [PATCH 039/352] KVM: x86/mmu: Clean up function comments for dirty logging APIs Rework the function comment for kvm_arch_mmu_enable_log_dirty_pt_masked() into the body of the function, as it has gotten a bit stale, is harder to read without the code context, and is the last source of warnings for W=1 builds in KVM x86 due to using a kernel-doc comment without documenting all parameters. Opportunistically subsume the functions comments for kvm_mmu_write_protect_pt_masked() and kvm_mmu_clear_dirty_pt_masked(), as there is no value in regurgitating similar information at a higher level, and capturing the differences between write-protection and PML-based dirty logging is best done in a common location. No functional change intended. 
Cc: David Matlack Reviewed-by: Kai Huang Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20240802202006.340854-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 48 +++++++++++++----------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 928cf84778b0c..5226bb055d99c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1307,15 +1307,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return flush; } -/** - * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages - * @kvm: kvm instance - * @slot: slot to protect - * @gfn_offset: start of the BITS_PER_LONG pages we care about - * @mask: indicates which pages we should protect - * - * Used when we do not need to care about huge page mappings. - */ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) @@ -1339,16 +1330,6 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } -/** - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write - * protect the page if the D-bit isn't supported. - * @kvm: kvm instance - * @slot: slot to clear D-bit - * @gfn_offset: start of the BITS_PER_LONG pages we care about - * @mask: indicates which pages we should clear D-bit - * - * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. - */ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) @@ -1372,24 +1353,16 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, } } -/** - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected - * PT level pages. - * - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to - * enable dirty logging for them. - * - * We need to care about huge page mappings: e.g. during dirty logging we may - * have such mappings. - */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { /* - * Huge pages are NOT write protected when we start dirty logging in - * initially-all-set mode; must write protect them here so that they - * are split to 4K on the first write. + * If the slot was assumed to be "initially all dirty", write-protect + * huge pages to ensure they are split to 4KiB on the first write (KVM + * dirty logs at 4KiB granularity). If eager page splitting is enabled, + * immediately try to split huge pages, e.g. so that vCPUs don't get + * saddled with the cost of splitting. * * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn * of memslot has no such restriction, so the range can cross two large @@ -1411,7 +1384,16 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, PG_LEVEL_2M); } - /* Now handle 4K PTEs. */ + /* + * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in + * mask. If PML is enabled and the GFN doesn't need to be write- + * protected for other reasons, e.g. shadow paging, clear the Dirty bit. + * Otherwise clear the Writable bit. + * + * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is + * enabled but it chooses between clearing the Dirty bit and Writeable + * bit based on the context. 
+ */ if (kvm_x86_ops.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else From 74a0e79df68a8042fb84fd7207e57b70722cf825 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:26 -0700 Subject: [PATCH 040/352] KVM: SVM: Disallow guest from changing userspace's MSR_AMD64_DE_CFG value Inject a #GP if the guest attempts to change MSR_AMD64_DE_CFG from its *current* value, not if the guest attempts to write a value other than KVM's set of supported bits. As per the comment and the changelog of the original code, the intent is to effectively make MSR_AMD64_DE_CFG read- only for the guest. Opportunistically use a more conventional equality check instead of an exclusive-OR check to detect attempts to change bits. Fixes: d1d93fa90f1a ("KVM: SVM: Add MSR-based feature support for serializing LFENCE") Cc: Tom Lendacky Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240802181935.292540-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d6f252555ab3f..eaa41a6a00ee6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3189,8 +3189,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & ~msr_entry.data) return 1; - /* Don't allow the guest to change a bit, #GP */ - if (!msr->host_initiated && (data ^ msr_entry.data)) + /* + * Don't let the guest change the host-programmed value. The + * MSR is very model specific, i.e. contains multiple bits that + * are completely unknown to KVM, and the one bit known to KVM + * is simply a reflection of hardware capabilities. + */ + if (!msr->host_initiated && data != svm->msr_decfg) return 1; svm->msr_decfg = data; From b58b808cbe93e8abe936b285ae534c9927789242 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:27 -0700 Subject: [PATCH 041/352] KVM: x86: Move MSR_TYPE_{R,W,RW} values from VMX to x86, as enums Move VMX's MSR_TYPE_{R,W,RW} #defines to x86.h, as enums, so that they can be used by common x86 code, e.g. instead of doing "bool write". Opportunistically tweak the definitions to make it more obvious that the values are bitmasks, not arbitrary ascending values. No functional change intended. 
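To illustrate why bitmask values matter here: a single access-type parameter can then be tested for read, write, or both with simple mask checks, instead of threading a "bool write" through the call chain. A hedged sketch built on kvm_msr_allowed(), which already takes KVM_MSR_FILTER_READ/WRITE; the helper below is illustrative and is not added by this series:

	/* Sketch: one 'rw' parameter covers R, W, or RW thanks to the bitmask values. */
	static bool msr_access_allowed(struct kvm_vcpu *vcpu, u32 msr,
				       enum kvm_msr_access rw)
	{
		if ((rw & MSR_TYPE_R) && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
			return false;
		if ((rw & MSR_TYPE_W) && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
			return false;
		return true;
	}
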
Link: https://lore.kernel.org/r/20240802181935.292540-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.h | 4 ---- arch/x86/kvm/x86.h | 6 ++++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 42498fa63abbe..3839afb921e22 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -17,10 +17,6 @@ #include "run_flags.h" #include "../mmu.h" -#define MSR_TYPE_R 1 -#define MSR_TYPE_W 2 -#define MSR_TYPE_RW 3 - #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 50596f6f83208..499adef960385 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -504,6 +504,12 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); +enum kvm_msr_access { + MSR_TYPE_R = BIT(0), + MSR_TYPE_W = BIT(1), + MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W, +}; + /* * Internal error codes that are used to indicate that MSR emulation encountered * an error that should result in #GP in the guest, unless userspace From aaecae7b6a2b19a874a7df0d474f44f3a5b5a74e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:28 -0700 Subject: [PATCH 042/352] KVM: x86: Rename KVM_MSR_RET_INVALID to KVM_MSR_RET_UNSUPPORTED Rename the "INVALID" internal MSR error return code to "UNSUPPORTED" to try and make it more clear that access was denied because the MSR itself is unsupported/unknown. "INVALID" is too ambiguous, as it could just as easily mean the value for WRMSR as invalid. Avoid UNKNOWN and UNIMPLEMENTED, as the error code is used for MSRs that _are_ actually implemented by KVM, e.g. if the MSR is unsupported because an associated feature flag is not present in guest CPUID. Opportunistically beef up the comments for the internal MSR error codes. 
Link: https://lore.kernel.org/r/20240802181935.292540-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 12 ++++++------ arch/x86/kvm/x86.h | 15 +++++++++++---- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index eaa41a6a00ee6..a265f6e621fcc 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2835,7 +2835,7 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; default: - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } return 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f18c2d8c7476e..e5b253e4d4215 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2006,7 +2006,7 @@ int vmx_get_msr_feature(struct kvm_msr_entry *msr) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); default: - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ffdf251bfef5b..ea2f3ff939574 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1687,7 +1687,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) msr.index = index; r = kvm_get_msr_feature(&msr); - if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false)) + if (r == KVM_MSR_RET_UNSUPPORTED && kvm_msr_ignored_check(index, 0, false)) r = 0; *data = msr.data; @@ -1884,7 +1884,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, { int ret = __kvm_set_msr(vcpu, index, data, host_initiated); - if (ret == KVM_MSR_RET_INVALID) + if (ret == KVM_MSR_RET_UNSUPPORTED) if (kvm_msr_ignored_check(index, data, true)) ret = 0; @@ -1929,7 +1929,7 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, { int ret = __kvm_get_msr(vcpu, index, data, host_initiated); - if (ret == KVM_MSR_RET_INVALID) { + if (ret == KVM_MSR_RET_UNSUPPORTED) { /* Unconditionally clear *data for simplicity */ *data = 0; if (kvm_msr_ignored_check(index, 0, false)) @@ -1998,7 +1998,7 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) static u64 kvm_msr_reason(int r) { switch (r) { - case KVM_MSR_RET_INVALID: + case KVM_MSR_RET_UNSUPPORTED: return KVM_MSR_EXIT_REASON_UNKNOWN; case KVM_MSR_RET_FILTERED: return KVM_MSR_EXIT_REASON_FILTER; @@ -4146,7 +4146,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_is_msr_to_save(msr)) break; - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } return 0; } @@ -4507,7 +4507,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 499adef960385..f47b9905ba782 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -512,11 +512,18 @@ enum kvm_msr_access { /* * Internal error codes that are used to indicate that MSR emulation encountered - * an error that should result in #GP in the guest, unless userspace - * handles it. + * an error that should result in #GP in the guest, unless userspace handles it. + * Note, '1', '0', and negative numbers are off limits, as they are used by KVM + * as part of KVM's lightly documented internal KVM_RUN return codes. 
+ * + * UNSUPPORTED - The MSR isn't supported, either because it is completely + * unknown to KVM, or because the MSR should not exist according + * to the vCPU model. + * + * FILTERED - Access to the MSR is denied by a userspace MSR filter. */ -#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */ -#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */ +#define KVM_MSR_RET_UNSUPPORTED 2 +#define KVM_MSR_RET_FILTERED 3 #define __cr4_reserved_bits(__cpu_has, __c) \ ({ \ From 74c6c98a598a1fa650f9f8dfb095d66e987ed9cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:29 -0700 Subject: [PATCH 043/352] KVM: x86: Refactor kvm_x86_ops.get_msr_feature() to avoid kvm_msr_entry Refactor get_msr_feature() to take the index and data pointer as distinct parameters in anticipation of eliminating "struct kvm_msr_entry" usage further up the primary callchain. No functional change intended. Link: https://lore.kernel.org/r/20240802181935.292540-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 16 +++++++--------- arch/x86/kvm/vmx/vmx.c | 6 +++--- arch/x86/kvm/vmx/x86_ops.h | 2 +- arch/x86/kvm/x86.c | 2 +- 5 files changed, 13 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a68cb3eba78f..fe61cff1f49d8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1806,7 +1806,7 @@ struct kvm_x86_ops { int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); void (*guest_memory_reclaimed)(struct kvm *kvm); - int (*get_msr_feature)(struct kvm_msr_entry *entry); + int (*get_msr_feature)(u32 msr, u64 *data); int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a265f6e621fcc..314dd4aacfe97 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2825,14 +2825,14 @@ static int efer_trap(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, ret); } -static int svm_get_msr_feature(struct kvm_msr_entry *msr) +static int svm_get_msr_feature(u32 msr, u64 *data) { - msr->data = 0; + *data = 0; - switch (msr->index) { + switch (msr) { case MSR_AMD64_DE_CFG: if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) - msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; + *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; default: return KVM_MSR_RET_UNSUPPORTED; @@ -3179,14 +3179,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; case MSR_AMD64_DE_CFG: { - struct kvm_msr_entry msr_entry; + u64 supported_de_cfg; - msr_entry.index = msr->index; - if (svm_get_msr_feature(&msr_entry)) + if (svm_get_msr_feature(ecx, &supported_de_cfg)) return 1; - /* Check the supported bits */ - if (data & ~msr_entry.data) + if (data & ~supported_de_cfg) return 1; /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e5b253e4d4215..3d24eb4aeca26 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1998,13 +1998,13 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, return !(msr->data & ~valid_bits); } -int vmx_get_msr_feature(struct kvm_msr_entry *msr) +int vmx_get_msr_feature(u32 msr, u64 *data) { - switch (msr->index) { + switch (msr) { case KVM_FIRST_EMULATED_VMX_MSR ... 
KVM_LAST_EMULATED_VMX_MSR: if (!nested) return 1; - return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); default: return KVM_MSR_RET_UNSUPPORTED; } diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index ce3221cd1d01a..9a0304eb847b2 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -56,7 +56,7 @@ bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); -int vmx_get_msr_feature(struct kvm_msr_entry *msr); +int vmx_get_msr_feature(u32 msr, u64 *data); int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ea2f3ff939574..29d1205f62d39 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1672,7 +1672,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - return kvm_x86_call(get_msr_feature)(msr); + return kvm_x86_call(get_msr_feature)(msr->index, &msr->data); } return 0; } From b848f24bd74a699745c1145e8cb707884d80694e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:30 -0700 Subject: [PATCH 044/352] KVM: x86: Rename get_msr_feature() APIs to get_feature_msr() Rename all APIs related to feature MSRs from get_msr_feature() to get_feature_msr(). The APIs get "feature MSRs", not "MSR features". And unlike kvm_{g,s}et_msr_common(), the "feature" adjective doesn't describe the helper itself. No functional change intended. 
Link: https://lore.kernel.org/r/20240802181935.292540-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 6 +++--- arch/x86/kvm/vmx/main.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/x86_ops.h | 2 +- arch/x86/kvm/x86.c | 12 ++++++------ 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 68ad4f923664e..9afbf8bcb5210 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -125,7 +125,7 @@ KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) -KVM_X86_OP(get_msr_feature) +KVM_X86_OP(get_feature_msr) KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fe61cff1f49d8..95396e4cb3dad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1806,7 +1806,7 @@ struct kvm_x86_ops { int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); void (*guest_memory_reclaimed)(struct kvm *kvm); - int (*get_msr_feature)(u32 msr, u64 *data); + int (*get_feature_msr)(u32 msr, u64 *data); int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 314dd4aacfe97..d8cfe8f233276 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2825,7 +2825,7 @@ static int efer_trap(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, ret); } -static int svm_get_msr_feature(u32 msr, u64 *data) +static int svm_get_feature_msr(u32 msr, u64 *data) { *data = 0; @@ -3181,7 +3181,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_AMD64_DE_CFG: { u64 supported_de_cfg; - if (svm_get_msr_feature(ecx, &supported_de_cfg)) + if (svm_get_feature_msr(ecx, &supported_de_cfg)) return 1; if (data & ~supported_de_cfg) @@ -5002,7 +5002,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, - .get_msr_feature = svm_get_msr_feature, + .get_feature_msr = svm_get_feature_msr, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 0bf35ebe8a1bd..4f6023a0deb34 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -41,7 +41,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .vcpu_put = vmx_vcpu_put, .update_exception_bitmap = vmx_update_exception_bitmap, - .get_msr_feature = vmx_get_msr_feature, + .get_feature_msr = vmx_get_feature_msr, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3d24eb4aeca26..cf85f8d50ccb3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1998,7 +1998,7 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, return !(msr->data & ~valid_bits); } -int vmx_get_msr_feature(u32 msr, u64 *data) +int vmx_get_feature_msr(u32 msr, u64 *data) { switch (msr) { case KVM_FIRST_EMULATED_VMX_MSR ... 
KVM_LAST_EMULATED_VMX_MSR: diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 9a0304eb847b2..eeafd121fb089 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -56,7 +56,7 @@ bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); -int vmx_get_msr_feature(u32 msr, u64 *data); +int vmx_get_feature_msr(u32 msr, u64 *data); int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 29d1205f62d39..3efe086bea4cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1659,7 +1659,7 @@ static u64 kvm_get_arch_capabilities(void) return data; } -static int kvm_get_msr_feature(struct kvm_msr_entry *msr) +static int kvm_get_feature_msr(struct kvm_msr_entry *msr) { switch (msr->index) { case MSR_IA32_ARCH_CAPABILITIES: @@ -1672,12 +1672,12 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - return kvm_x86_call(get_msr_feature)(msr->index, &msr->data); + return kvm_x86_call(get_feature_msr)(msr->index, &msr->data); } return 0; } -static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { struct kvm_msr_entry msr; int r; @@ -1685,7 +1685,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) /* Unconditionally clear the output for simplicity */ msr.data = 0; msr.index = index; - r = kvm_get_msr_feature(&msr); + r = kvm_get_feature_msr(&msr); if (r == KVM_MSR_RET_UNSUPPORTED && kvm_msr_ignored_check(index, 0, false)) r = 0; @@ -4943,7 +4943,7 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_GET_MSRS: - r = msr_io(NULL, argp, do_get_msr_feature, 1); + r = msr_io(NULL, argp, do_get_feature_msr, 1); break; #ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: @@ -7382,7 +7382,7 @@ static void kvm_probe_feature_msr(u32 msr_index) .index = msr_index, }; - if (kvm_get_msr_feature(&msr)) + if (kvm_get_feature_msr(&msr)) return; msr_based_features[num_msr_based_features++] = msr_index; From 7075f163615072a74bae7c5344210adec5f9ea2a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:31 -0700 Subject: [PATCH 045/352] KVM: x86: Refactor kvm_get_feature_msr() to avoid struct kvm_msr_entry Refactor kvm_get_feature_msr() to take the components of kvm_msr_entry as separate parameters, along with a vCPU pointer, i.e. to give it the same prototype as kvm_{g,s}et_msr_ignored_check(). This will allow using a common inner helper for handling accesses to "regular" and feature MSRs. No functional change intended. 
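The payoff of the uniform prototype shows up in the next patch: every leaf accessor can be passed as a function pointer to a single helper that layers the error handling on top. Roughly (the typedef below is introduced by the following patch and is shown here only to motivate the prototype change):

	/* Sketch: with a uniform prototype, kvm_get_feature_msr() can be passed
	 * to a common trampoline alongside __kvm_get_msr() and _kvm_set_msr(). */
	typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data,
				    bool host_initiated);
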
Link: https://lore.kernel.org/r/20240802181935.292540-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3efe086bea4cc..0c2fa6c590a23 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1659,39 +1659,38 @@ static u64 kvm_get_arch_capabilities(void) return data; } -static int kvm_get_feature_msr(struct kvm_msr_entry *msr) +static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { - switch (msr->index) { + WARN_ON_ONCE(!host_initiated); + + switch (index) { case MSR_IA32_ARCH_CAPABILITIES: - msr->data = kvm_get_arch_capabilities(); + *data = kvm_get_arch_capabilities(); break; case MSR_IA32_PERF_CAPABILITIES: - msr->data = kvm_caps.supported_perf_cap; + *data = kvm_caps.supported_perf_cap; break; case MSR_IA32_UCODE_REV: - rdmsrl_safe(msr->index, &msr->data); + rdmsrl_safe(index, data); break; default: - return kvm_x86_call(get_feature_msr)(msr->index, &msr->data); + return kvm_x86_call(get_feature_msr)(index, data); } return 0; } static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - struct kvm_msr_entry msr; int r; /* Unconditionally clear the output for simplicity */ - msr.data = 0; - msr.index = index; - r = kvm_get_feature_msr(&msr); + *data = 0; + r = kvm_get_feature_msr(vcpu, index, data, true); if (r == KVM_MSR_RET_UNSUPPORTED && kvm_msr_ignored_check(index, 0, false)) r = 0; - *data = msr.data; - return r; } @@ -7378,11 +7377,9 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) static void kvm_probe_feature_msr(u32 msr_index) { - struct kvm_msr_entry msr = { - .index = msr_index, - }; + u64 data; - if (kvm_get_feature_msr(&msr)) + if (kvm_get_feature_msr(NULL, msr_index, &data, true)) return; msr_based_features[num_msr_based_features++] = msr_index; From 1cec2034980ad03ebf8ce0f187a8f3101c33e611 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:32 -0700 Subject: [PATCH 046/352] KVM: x86: Funnel all fancy MSR return value handling into a common helper Add a common helper, kvm_do_msr_access(), to invoke the "leaf" APIs that are type and access specific, and more importantly to handle errors that are returned from the leaf APIs. I.e. turn kvm_msr_ignored_check() from a a helper that is called on an error, into a trampoline that detects errors *and* applies relevant side effects, e.g. logging unimplemented accesses. Because the leaf APIs are used for guest accesses, userspace accesses, and KVM accesses, and because KVM supports restricting access to MSRs from userspace via filters, the error handling is subtly non-trivial. E.g. KVM has had at least one bug escape due to making each "outer" function handle errors. See commit 3376ca3f1a20 ("KVM: x86: Fix KVM_GET_MSRS stack info leak"). Link: https://lore.kernel.org/r/20240802181935.292540-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 84 +++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c2fa6c590a23..faf1ba786a341 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -304,25 +304,40 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { static struct kmem_cache *x86_emulator_cache; -/* - * When called, it means the previous get/set msr reached an invalid msr. 
- * Return true if we want to ignore/silent this failed msr access. - */ -static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) +typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated); + +static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr, + u64 *data, bool host_initiated, + enum kvm_msr_access rw, + msr_access_t msr_access_fn) { - const char *op = write ? "wrmsr" : "rdmsr"; + const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr"; + int ret; - if (ignore_msrs) { - if (report_ignored_msrs) - kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", - op, msr, data); - /* Mask the error */ - return true; - } else { + BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W); + + /* + * Zero the data on read failures to avoid leaking stack data to the + * guest and/or userspace, e.g. if the failure is ignored below. + */ + ret = msr_access_fn(vcpu, msr, data, host_initiated); + if (ret && rw == MSR_TYPE_R) + *data = 0; + + if (ret != KVM_MSR_RET_UNSUPPORTED) + return ret; + + if (!ignore_msrs) { kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", - op, msr, data); - return false; + op, msr, *data); + return ret; } + + if (report_ignored_msrs) + kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data); + + return 0; } static struct kmem_cache *kvm_alloc_emulator_cache(void) @@ -1682,16 +1697,8 @@ static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - int r; - - /* Unconditionally clear the output for simplicity */ - *data = 0; - r = kvm_get_feature_msr(vcpu, index, data, true); - - if (r == KVM_MSR_RET_UNSUPPORTED && kvm_msr_ignored_check(index, 0, false)) - r = 0; - - return r; + return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R, + kvm_get_feature_msr); } static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -1878,16 +1885,17 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, return kvm_x86_call(set_msr)(vcpu, &msr); } +static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) +{ + return __kvm_set_msr(vcpu, index, *data, host_initiated); +} + static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 data, bool host_initiated) { - int ret = __kvm_set_msr(vcpu, index, data, host_initiated); - - if (ret == KVM_MSR_RET_UNSUPPORTED) - if (kvm_msr_ignored_check(index, data, true)) - ret = 0; - - return ret; + return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W, + _kvm_set_msr); } /* @@ -1926,16 +1934,8 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { - int ret = __kvm_get_msr(vcpu, index, data, host_initiated); - - if (ret == KVM_MSR_RET_UNSUPPORTED) { - /* Unconditionally clear *data for simplicity */ - *data = 0; - if (kvm_msr_ignored_check(index, 0, false)) - ret = 0; - } - - return ret; + return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R, + __kvm_get_msr); } static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) From 3adef9034596a8ba6415e6e460209cd9fc524e81 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:33 -0700 Subject: [PATCH 047/352] KVM: x86: Hoist x86.c's global msr_* variables up above kvm_do_msr_access() Move the definitions of the various MSR arrays above kvm_do_msr_access() so that kvm_do_msr_access() 
can query the arrays when handling failures, e.g. to squash errors if userspace tries to read an MSR that isn't fully supported, but that KVM advertised as being an MSR-to-save. No functional change intended. Link: https://lore.kernel.org/r/20240802181935.292540-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 368 ++++++++++++++++++++++----------------------- 1 file changed, 184 insertions(+), 184 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index faf1ba786a341..92cb4aa7efd7f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -304,6 +304,190 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { static struct kmem_cache *x86_emulator_cache; +/* + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds + * MSRs that KVM emulates without strictly requiring host support. + * msr_based_features holds MSRs that enumerate features, i.e. are effectively + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with + * msrs_to_save and emulated_msrs. + */ + +static const u32 msrs_to_save_base[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, + MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, + MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, + MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, + MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, + MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, + MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_IA32_UMWAIT_CONTROL, + + MSR_IA32_XFD, MSR_IA32_XFD_ERR, +}; + +static const u32 msrs_to_save_pmu[] = { + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_CORE_PERF_GLOBAL_CTRL, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, + + /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */ + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + + /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. 
*/ + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, +}; + +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + + ARRAY_SIZE(msrs_to_save_pmu)]; +static unsigned num_msrs_to_save; + +static const u32 emulated_msrs_all[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + +#ifdef CONFIG_KVM_HYPERV + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_STIMER0_CONFIG, + HV_X64_MSR_VP_ASSIST_PAGE, + HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, + HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, +#endif + + MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, + + MSR_IA32_TSC_ADJUST, + MSR_IA32_TSC_DEADLINE, + MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, + MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, + MSR_IA32_SMBASE, + MSR_SMI_COUNT, + MSR_PLATFORM_INFO, + MSR_MISC_FEATURES_ENABLES, + MSR_AMD64_VIRT_SPEC_CTRL, + MSR_AMD64_TSC_RATIO, + MSR_IA32_POWER_CTL, + MSR_IA32_UCODE_REV, + + /* + * KVM always supports the "true" VMX control MSRs, even if the host + * does not. The VMX MSRs as a whole are considered "emulated" as KVM + * doesn't strictly require them to exist in the host (ignoring that + * KVM would refuse to load in the first place if the core set of MSRs + * aren't supported). + */ + MSR_IA32_VMX_BASIC, + MSR_IA32_VMX_TRUE_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_EXIT_CTLS, + MSR_IA32_VMX_TRUE_ENTRY_CTLS, + MSR_IA32_VMX_MISC, + MSR_IA32_VMX_CR0_FIXED0, + MSR_IA32_VMX_CR4_FIXED0, + MSR_IA32_VMX_VMCS_ENUM, + MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_EPT_VPID_CAP, + MSR_IA32_VMX_VMFUNC, + + MSR_K7_HWCR, + MSR_KVM_POLL_CONTROL, +}; + +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; +static unsigned num_emulated_msrs; + +/* + * List of MSRs that control the existence of MSR-based features, i.e. MSRs + * that are effectively CPUID leafs. VMX MSRs are also included in the set of + * feature MSRs, but are handled separately to allow expedited lookups. + */ +static const u32 msr_based_features_all_except_vmx[] = { + MSR_AMD64_DE_CFG, + MSR_IA32_UCODE_REV, + MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, +}; + +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + + (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; +static unsigned int num_msr_based_features; + +/* + * All feature MSRs except uCode revID, which tracks the currently loaded uCode + * patch, are immutable once the vCPU model is defined. 
+ */ +static bool kvm_is_immutable_feature_msr(u32 msr) +{ + int i; + + if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) + return true; + + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { + if (msr == msr_based_features_all_except_vmx[i]) + return msr != MSR_IA32_UCODE_REV; + } + + return false; +} + +static bool kvm_is_msr_to_save(u32 msr_index) +{ + unsigned int i; + + for (i = 0; i < num_msrs_to_save; i++) { + if (msrs_to_save[i] == msr_index) + return true; + } + + return false; +} + typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); @@ -1425,178 +1609,6 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); -/* - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track - * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, - * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that - * require host support, i.e. should be probed via RDMSR. emulated_msrs holds - * MSRs that KVM emulates without strictly requiring host support. - * msr_based_features holds MSRs that enumerate features, i.e. are effectively - * CPUID leafs. Note, msr_based_features isn't mutually exclusive with - * msrs_to_save and emulated_msrs. - */ - -static const u32 msrs_to_save_base[] = { - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, - MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, - MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, - MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, - MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, - MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, - MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, - MSR_IA32_UMWAIT_CONTROL, - - MSR_IA32_XFD, MSR_IA32_XFD_ERR, -}; - -static const u32 msrs_to_save_pmu[] = { - MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, - MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, - MSR_CORE_PERF_GLOBAL_CTRL, - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, - - /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */ - MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, - MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, - MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, - MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, - MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, - MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, - MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, - MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, - - MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, - MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, - - /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. 
*/ - MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, - MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, - MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, - MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, - - MSR_AMD64_PERF_CNTR_GLOBAL_CTL, - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, -}; - -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + - ARRAY_SIZE(msrs_to_save_pmu)]; -static unsigned num_msrs_to_save; - -static const u32 emulated_msrs_all[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - -#ifdef CONFIG_KVM_HYPERV - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, - HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, - HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, - HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, - HV_X64_MSR_RESET, - HV_X64_MSR_VP_INDEX, - HV_X64_MSR_VP_RUNTIME, - HV_X64_MSR_SCONTROL, - HV_X64_MSR_STIMER0_CONFIG, - HV_X64_MSR_VP_ASSIST_PAGE, - HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, - HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, - HV_X64_MSR_SYNDBG_OPTIONS, - HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, - HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, - HV_X64_MSR_SYNDBG_PENDING_BUFFER, -#endif - - MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, - - MSR_IA32_TSC_ADJUST, - MSR_IA32_TSC_DEADLINE, - MSR_IA32_ARCH_CAPABILITIES, - MSR_IA32_PERF_CAPABILITIES, - MSR_IA32_MISC_ENABLE, - MSR_IA32_MCG_STATUS, - MSR_IA32_MCG_CTL, - MSR_IA32_MCG_EXT_CTL, - MSR_IA32_SMBASE, - MSR_SMI_COUNT, - MSR_PLATFORM_INFO, - MSR_MISC_FEATURES_ENABLES, - MSR_AMD64_VIRT_SPEC_CTRL, - MSR_AMD64_TSC_RATIO, - MSR_IA32_POWER_CTL, - MSR_IA32_UCODE_REV, - - /* - * KVM always supports the "true" VMX control MSRs, even if the host - * does not. The VMX MSRs as a whole are considered "emulated" as KVM - * doesn't strictly require them to exist in the host (ignoring that - * KVM would refuse to load in the first place if the core set of MSRs - * aren't supported). - */ - MSR_IA32_VMX_BASIC, - MSR_IA32_VMX_TRUE_PINBASED_CTLS, - MSR_IA32_VMX_TRUE_PROCBASED_CTLS, - MSR_IA32_VMX_TRUE_EXIT_CTLS, - MSR_IA32_VMX_TRUE_ENTRY_CTLS, - MSR_IA32_VMX_MISC, - MSR_IA32_VMX_CR0_FIXED0, - MSR_IA32_VMX_CR4_FIXED0, - MSR_IA32_VMX_VMCS_ENUM, - MSR_IA32_VMX_PROCBASED_CTLS2, - MSR_IA32_VMX_EPT_VPID_CAP, - MSR_IA32_VMX_VMFUNC, - - MSR_K7_HWCR, - MSR_KVM_POLL_CONTROL, -}; - -static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; -static unsigned num_emulated_msrs; - -/* - * List of MSRs that control the existence of MSR-based features, i.e. MSRs - * that are effectively CPUID leafs. VMX MSRs are also included in the set of - * feature MSRs, but are handled separately to allow expedited lookups. - */ -static const u32 msr_based_features_all_except_vmx[] = { - MSR_AMD64_DE_CFG, - MSR_IA32_UCODE_REV, - MSR_IA32_ARCH_CAPABILITIES, - MSR_IA32_PERF_CAPABILITIES, -}; - -static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + - (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; -static unsigned int num_msr_based_features; - -/* - * All feature MSRs except uCode revID, which tracks the currently loaded uCode - * patch, are immutable once the vCPU model is defined. 
- */ -static bool kvm_is_immutable_feature_msr(u32 msr) -{ - int i; - - if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) - return true; - - for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { - if (msr == msr_based_features_all_except_vmx[i]) - return msr != MSR_IA32_UCODE_REV; - } - - return false; -} - /* * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM * does not yet virtualize. These include: @@ -3744,18 +3756,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } -static bool kvm_is_msr_to_save(u32 msr_index) -{ - unsigned int i; - - for (i = 0; i < num_msrs_to_save; i++) { - if (msrs_to_save[i] == msr_index) - return true; - } - - return false; -} - int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr = msr_info->index; From 64a5d7a1091ff6ee70d2155b9dccfe8107d35ffa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:34 -0700 Subject: [PATCH 048/352] KVM: x86: Suppress failures on userspace access to advertised, unsupported MSRs Extend KVM's suppression of failures due to a userspace access to an unsupported, but advertised as a "to save" MSR to all MSRs, not just those that happen to reach the default case statements in kvm_get_msr_common() and kvm_set_msr_common(). KVM's soon-to-be-established ABI is that if an MSR is advertised to userspace, then userspace is allowed to read the MSR, and write back the value that was read, i.e. why an MSR is unsupported doesn't change KVM's ABI. Practically speaking, this is very nearly a nop, as the only other paths that return KVM_MSR_RET_UNSUPPORTED are {svm,vmx}_get_feature_msr(), and it's unlikely, though not impossible, that userspace is using KVM_GET_MSRS on unsupported MSRs. The primary goal of moving the suppression to common code is to allow returning KVM_MSR_RET_UNSUPPORTED as appropriate throughout KVM, without having to manually handle the "is userspace accessing an advertised" waiver. I.e. this will allow formalizing KVM's ABI without incurring a high maintenance cost. Link: https://lore.kernel.org/r/20240802181935.292540-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92cb4aa7efd7f..1762bde488a2d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -512,6 +512,15 @@ static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr, if (ret != KVM_MSR_RET_UNSUPPORTED) return ret; + /* + * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM + * reports as to-be-saved, even if an MSR isn't fully supported. + * Simply check that @data is '0', which covers both the write '0' case + * and all reads (in which case @data is zeroed on failure; see above). + */ + if (host_initiated && !*data && kvm_is_msr_to_save(msr)) + return 0; + if (!ignore_msrs) { kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", op, msr, *data); @@ -4137,14 +4146,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - /* - * Userspace is allowed to write '0' to MSRs that KVM reports - * as to-be-saved, even if an MSRs isn't fully supported. 
- */ - if (msr_info->host_initiated && !data && - kvm_is_msr_to_save(msr)) - break; - return KVM_MSR_RET_UNSUPPORTED; } return 0; @@ -4496,16 +4497,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); - /* - * Userspace is allowed to read MSRs that KVM reports as - * to-be-saved, even if an MSR isn't fully supported. - */ - if (msr_info->host_initiated && - kvm_is_msr_to_save(msr_info->index)) { - msr_info->data = 0; - break; - } - return KVM_MSR_RET_UNSUPPORTED; } return 0; From 44dd0f5732b466a6e4d2a9b3aad1678f43f061af Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:35 -0700 Subject: [PATCH 049/352] KVM: x86: Suppress userspace access failures on unsupported, "emulated" MSRs Extend KVM's suppression of userspace MSR access failures to MSRs that KVM reports as emulated, but are ultimately unsupported, e.g. if the VMX MSRs are emulated by KVM, but are unsupported given the vCPU model. Suggested-by: Weijiang Yang Reviewed-by: Weijiang Yang Link: https://lore.kernel.org/r/20240802181935.292540-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1762bde488a2d..00e7927250525 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -476,7 +476,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr) return false; } -static bool kvm_is_msr_to_save(u32 msr_index) +static bool kvm_is_advertised_msr(u32 msr_index) { unsigned int i; @@ -485,6 +485,11 @@ static bool kvm_is_msr_to_save(u32 msr_index) return true; } + for (i = 0; i < num_emulated_msrs; i++) { + if (emulated_msrs[i] == msr_index) + return true; + } + return false; } @@ -514,11 +519,11 @@ static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr, /* * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM - * reports as to-be-saved, even if an MSR isn't fully supported. + * advertises to userspace, even if an MSR isn't fully supported. * Simply check that @data is '0', which covers both the write '0' case * and all reads (in which case @data is zeroed on failure; see above). */ - if (host_initiated && !*data && kvm_is_msr_to_save(msr)) + if (host_initiated && !*data && kvm_is_advertised_msr(msr)) return 0; if (!ignore_msrs) { From 24a7e944966cc8a285e6581dcc98edebeee76c97 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 16 Aug 2024 15:01:38 +0200 Subject: [PATCH 050/352] KVM: selftests: Move Hyper-V specific functions out of processor.c Since there is 'hyperv.c' for Hyper-V specific functions already, move Hyper-V specific functions out of processor.c there. No functional change intended. 
Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240816130139.286246-2-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86_64/hyperv.h | 4 ++ .../selftests/kvm/include/x86_64/processor.h | 7 ++- .../testing/selftests/kvm/lib/x86_64/hyperv.c | 59 ++++++++++++++++++ .../selftests/kvm/lib/x86_64/processor.c | 61 ------------------- .../selftests/kvm/x86_64/xen_vmcall_test.c | 1 + 5 files changed, 68 insertions(+), 64 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h index fa65b908b13e5..a2e7cf7ee0ad1 100644 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -343,4 +343,8 @@ struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, /* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ #define HV_INVARIANT_TSC_EXPOSED BIT_ULL(0) +const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); +const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); + #endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index a0c1440017bb3..e247f99e0473d 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -25,6 +25,10 @@ extern bool host_cpu_is_intel; extern bool host_cpu_is_amd; extern uint64_t guest_tsc_khz; +#ifndef MAX_NR_CPUID_ENTRIES +#define MAX_NR_CPUID_ENTRIES 100 +#endif + /* Forced emulation prefix, used to invoke the emulator unconditionally. */ #define KVM_FEP "ud2; .byte 'k', 'v', 'm';" @@ -908,8 +912,6 @@ static inline void vcpu_xcrs_set(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs) const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, uint32_t function, uint32_t index); const struct kvm_cpuid2 *kvm_get_supported_cpuid(void); -const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); -const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); static inline uint32_t kvm_cpu_fms(void) { @@ -1009,7 +1011,6 @@ static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries) } void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid); -void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function, diff --git a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c index efb7e7a1354dc..b4a5e4ad71054 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c +++ b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c @@ -8,6 +8,65 @@ #include "processor.h" #include "hyperv.h" +const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) +{ + static struct kvm_cpuid2 *cpuid; + int kvm_fd; + + if (cpuid) + return cpuid; + + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + kvm_fd = open_kvm_dev_path_or_exit(); + + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + + close(kvm_fd); + return cpuid; +} + +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) +{ + static struct kvm_cpuid2 *cpuid_full; + const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; + int i, nent = 0; + + if (!cpuid_full) { + cpuid_sys = kvm_get_supported_cpuid(); + cpuid_hv = kvm_get_supported_hv_cpuid(); + + cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); + if (!cpuid_full) { + 
perror("malloc"); + abort(); + } + + /* Need to skip KVM CPUID leaves 0x400000xx */ + for (i = 0; i < cpuid_sys->nent; i++) { + if (cpuid_sys->entries[i].function >= 0x40000000 && + cpuid_sys->entries[i].function < 0x40000100) + continue; + cpuid_full->entries[nent] = cpuid_sys->entries[i]; + nent++; + } + + memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, + cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); + cpuid_full->nent = nent + cpuid_hv->nent; + } + + vcpu_init_cpuid(vcpu, cpuid_full); +} + +const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + + vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + + return cpuid; +} + struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, vm_vaddr_t *p_hv_pages_gva) { diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 153739f2e201f..7876f052ca39f 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -19,8 +19,6 @@ #define KERNEL_DS 0x10 #define KERNEL_TSS 0x18 -#define MAX_NR_CPUID_ENTRIES 100 - vm_vaddr_t exception_handlers; bool host_cpu_is_amd; bool host_cpu_is_intel; @@ -1195,65 +1193,6 @@ void xen_hypercall(uint64_t nr, uint64_t a0, void *a1) GUEST_ASSERT(!__xen_hypercall(nr, a0, a1)); } -const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) -{ - static struct kvm_cpuid2 *cpuid; - int kvm_fd; - - if (cpuid) - return cpuid; - - cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); - kvm_fd = open_kvm_dev_path_or_exit(); - - kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); - - close(kvm_fd); - return cpuid; -} - -void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) -{ - static struct kvm_cpuid2 *cpuid_full; - const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; - int i, nent = 0; - - if (!cpuid_full) { - cpuid_sys = kvm_get_supported_cpuid(); - cpuid_hv = kvm_get_supported_hv_cpuid(); - - cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); - if (!cpuid_full) { - perror("malloc"); - abort(); - } - - /* Need to skip KVM CPUID leaves 0x400000xx */ - for (i = 0; i < cpuid_sys->nent; i++) { - if (cpuid_sys->entries[i].function >= 0x40000000 && - cpuid_sys->entries[i].function < 0x40000100) - continue; - cpuid_full->entries[nent] = cpuid_sys->entries[i]; - nent++; - } - - memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, - cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); - cpuid_full->nent = nent + cpuid_hv->nent; - } - - vcpu_init_cpuid(vcpu, cpuid_full); -} - -const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); - - vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); - - return cpuid; -} - unsigned long vm_compute_max_gfn(struct kvm_vm *vm) { const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c index e149d0574961c..2585087cdf5cb 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c @@ -10,6 +10,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "hyperv.h" #define HCALL_REGION_GPA 0xc0000000ULL #define HCALL_REGION_SLOT 10 From d8414067cc17bd2070e6667763124754e2932251 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 16 Aug 
2024 15:01:39 +0200 Subject: [PATCH 051/352] KVM: selftests: Re-enable hyperv_evmcs/hyperv_svm_test on bare metal KVM_CAP_HYPERV_DIRECT_TLBFLUSH is only reported when KVM runs on top of Hyper-V and hyperv_evmcs/hyperv_svm_test don't need that, these tests check that the feature is properly emulated for Hyper-V on KVM guests. There's no corresponding CAP for that, the feature is reported in KVM_GET_SUPPORTED_HV_CPUID. Hyper-V specific CPUIDs are not reported by KVM_GET_SUPPORTED_CPUID, implement dedicated kvm_hv_cpu_has() helper to do the job. Fixes: 6dac1195181c ("KVM: selftests: Make Hyper-V tests explicitly require KVM Hyper-V support") Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240816130139.286246-3-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/x86_64/hyperv.h | 14 ++++++++++++++ tools/testing/selftests/kvm/lib/x86_64/hyperv.c | 8 ++++++++ tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c | 2 +- .../testing/selftests/kvm/x86_64/hyperv_svm_test.c | 2 +- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h index a2e7cf7ee0ad1..6849e2552f1bf 100644 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -186,6 +186,18 @@ #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED \ KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 14) +/* HYPERV_CPUID_NESTED_FEATURES.EAX */ +#define HV_X64_NESTED_DIRECT_FLUSH \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 17) +#define HV_X64_NESTED_GUEST_MAPPING_FLUSH \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 18) +#define HV_X64_NESTED_MSR_BITMAP \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 19) + +/* HYPERV_CPUID_NESTED_FEATURES.EBX */ +#define HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EBX, 0) + /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ #define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING \ KVM_X86_CPU_FEATURE(HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0, EAX, 1) @@ -347,4 +359,6 @@ const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); +bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature); + #endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c index b4a5e4ad71054..15bc8cd583aa4 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c +++ b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c @@ -67,6 +67,14 @@ const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) return cpuid; } +bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature) +{ + if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID)) + return false; + + return kvm_cpuid_has(kvm_get_supported_hv_cpuid(), feature); +} + struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, vm_vaddr_t *p_hv_pages_gva) { diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c index e192720bfe14b..74cf196613090 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c @@ -242,7 +242,7 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); 
TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH)); + TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index b987a3d797153..0ddb63229bcbb 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -157,7 +157,7 @@ int main(int argc, char *argv[]) int stage; TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH)); + TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); From 92f6d4130497f2bedfaf86a4e46e890cf8983307 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Thu, 18 Jul 2024 20:35:38 +0100 Subject: [PATCH 052/352] KVM: Fix coalesced_mmio_has_room() to avoid premature userspace exit The following calculation used in coalesced_mmio_has_room() to check whether the ring buffer is full is wrong and results in premature exits if the start of the valid entries is in the first half of the ring buffer. avail = (ring->first - last - 1) % KVM_COALESCED_MMIO_MAX; if (avail == 0) /* full */ Because negative values are handled using two's complement, and KVM computes the result as an unsigned value, the above will get a false positive if "first < last" and the ring is half-full. The above might have worked as expected in python for example: >>> (-86) % 170 84 However it doesn't work the same way in C. printf("avail: %d\n", (-86) % 170); printf("avail: %u\n", (-86) % 170); printf("avail: %u\n", (-86u) % 170u); Using gcc-11 these print: avail: -86 avail: 4294967210 avail: 0 For illustration purposes, given a 4-bit integer and a ring size of 0xA (unsigned), 0xA == 0x1010 == -6, and thus (-6u % 0xA) == 0. Fix the calculation and allow all but one entries in the buffer to be used as originally intended. Note, KVM's behavior is self-healing to some extent, as KVM will allow the entire buffer to be used if ring->first is beyond the halfway point. In other words, in the unlikely scenario that a use case benefits from being able to coalesce more than 86 entries at once, KVM will still provide such behavior, sometimes. Note #2, the % operator in C is not the modulo operator but the remainder operator. Modulo and remainder operators differ with respect to negative values. But, the relevant values in KVM are all unsigned, so it's a moot point in this case anyway. Note #3, this is almost a pure revert of the buggy commit, plus a READ_ONCE() to provide additional safety. Thue buggy commit justified the change with "it paves the way for making this function lockless", but it's not at all clear what was intended, nor is there any evidence that the buggy code was somehow safer. (a) the fields in question were already accessed locklessly, from the perspective that they could be modified by userspace at any time, and (b) the lock guarding the ring itself was changed, but never dropped, i.e. whatever lockless scheme (SRCU?) was planned never landed. 
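To make the failure mode concrete, here is a small stand-alone user-space sketch (not part of the patch; RING_MAX is an illustrative stand-in for KVM_COALESCED_MMIO_MAX) that contrasts the buggy unsigned-remainder check with the corrected wraparound check:

#include <stdio.h>
#include <stdint.h>

#define RING_MAX 170u	/* illustrative size only */

/*
 * Buggy check: the subtraction is done in unsigned arithmetic, so it
 * never goes negative, and "first < last" with a half-full ring can
 * yield avail == 0, i.e. a spurious "full".
 */
static int buggy_is_full(uint32_t first, uint32_t last)
{
	uint32_t avail = (first - last - 1) % RING_MAX;

	return avail == 0;
}

/*
 * Fixed check: the ring is full only when advancing "last" by one slot
 * (with wraparound) would collide with "first".
 */
static int fixed_is_full(uint32_t first, uint32_t last)
{
	return (last + 1) % RING_MAX == first;
}

int main(void)
{
	/*
	 * first = 0, last = 85: only half of the 170 slots are in use, yet
	 * (0 - 85 - 1) % 170 is 0 in uint32_t arithmetic, so the buggy
	 * check reports "full" while the fixed check correctly does not.
	 */
	printf("buggy: %d, fixed: %d\n", buggy_is_full(0, 85), fixed_is_full(0, 85));
	return 0;
}
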
Fixes: 105f8d40a737 ("KVM: Calculate available entries in coalesced mmio ring") Signed-off-by: Ilias Stamatis Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20240718193543.624039-2-ilstam@amazon.com [sean: rework changelog to clarify behavior, call out weirdness of buggy commit] Signed-off-by: Sean Christopherson --- virt/kvm/coalesced_mmio.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 1b90acb6e3fef..184c5c40c9c19 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -43,7 +43,6 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev, u32 last) { struct kvm_coalesced_mmio_ring *ring; - unsigned avail; /* Are we able to batch it ? */ @@ -52,8 +51,7 @@ static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev, u32 last) * there is always one unused entry in the buffer */ ring = dev->kvm->coalesced_mmio_ring; - avail = (ring->first - last - 1) % KVM_COALESCED_MMIO_MAX; - if (avail == 0) { + if ((last + 1) % KVM_COALESCED_MMIO_MAX == READ_ONCE(ring->first)) { /* full */ return 0; } From 9a948c0c8e741776ee6d938b49d34d22034fbb7d Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 14 Aug 2024 11:34:15 +0800 Subject: [PATCH 053/352] ceph: Remove unused declarations These functions is never implemented and used. Signed-off-by: Yue Haibing Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.h | 3 --- fs/ceph/super.h | 2 -- include/linux/ceph/osd_client.h | 2 -- 3 files changed, 7 deletions(-) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 9bcc7f181bfe2..585ab5a6d87d5 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -559,9 +559,6 @@ extern struct ceph_mds_session * ceph_get_mds_session(struct ceph_mds_session *s); extern void ceph_put_mds_session(struct ceph_mds_session *s); -extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, - struct ceph_msg *msg, int mds); - extern int ceph_mdsc_init(struct ceph_fs_client *fsc); extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6e817bf1337c6..c88bf53f68e9f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1056,8 +1056,6 @@ extern int ceph_fill_trace(struct super_block *sb, extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session); -extern int ceph_inode_holds_cap(struct inode *inode, int mask); - extern bool ceph_inode_set_size(struct inode *inode, loff_t size); extern void __ceph_do_pending_vmtruncate(struct inode *inode); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index f66f6aac74f6f..d7941478158cd 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -449,8 +449,6 @@ extern int ceph_osdc_init(struct ceph_osd_client *osdc, extern void ceph_osdc_stop(struct ceph_osd_client *osdc); extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc); -extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, - struct ceph_msg *msg); extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); From 2015716adbd9cac8505ed461f5f330e832be0c84 Mon Sep 17 00:00:00 2001 From: Chen Yufan Date: Thu, 22 Aug 2024 17:55:41 +0800 Subject: [PATCH 054/352] 
ceph: Convert to use jiffies macro Use time_after_eq macro instead of using jiffies directly to handle wraparound. [ xiubli: adjust the header files order ] Signed-off-by: Chen Yufan Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 808c9c0482768..6561a6cd94de2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -4659,7 +4660,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) * slowness doesn't block mdsc delayed work, * preventing send_renew_caps() from running. */ - if (jiffies - loop_start >= 5 * HZ) + if (time_after_eq(jiffies, loop_start + 5 * HZ)) break; } spin_unlock(&mdsc->cap_delay_lock); From ede0b1d30b82829d6bc7924be18c7ae09cb1eb33 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 22 Aug 2024 21:39:04 +0800 Subject: [PATCH 055/352] libceph: use min() to simplify code in ceph_dns_resolve_name() When resolving name in ceph_dns_resolve_name(), the end address of name is determined by the minimum value of delim_p and colon_p. So using min() here is more in line with the context. Signed-off-by: Li Zetao Reviewed-by: Simon Horman Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3c8b78d9c4d1c..d1b5705dc0c64 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1254,7 +1254,7 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen, colon_p = memchr(name, ':', namelen); if (delim_p && colon_p) - end = delim_p < colon_p ? delim_p : colon_p; + end = min(delim_p, colon_p); else if (!delim_p && colon_p) end = colon_p; else { From 907fa79d787f6f762c0afb67b8b02be8ed1f7bd2 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 21 Aug 2024 10:40:21 -0700 Subject: [PATCH 056/352] MAINTAINERS: scale modules with more reviewers We're looking to add Rust module support, and I don't speak Rust yet. The compromise was reached that in order to scale we'd get volunteers committed from the Rust community willing to review both Rust and C code for modules so we can ensure we get proper reviews for both parts of the code and so that we can scale. Add those who have stepped up to help. Acked-by: Sami Tolvanen Acked-by: Petr Pavlu Acked-by: Daniel Gomez Signed-off-by: Luis Chamberlain --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f328373463b0d..7e2cf251427d2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15454,6 +15454,9 @@ F: include/dt-bindings/clock/mobileye,eyeq5-clk.h MODULE SUPPORT M: Luis Chamberlain +R: Petr Pavlu +R: Sami Tolvanen +R: Daniel Gomez L: linux-modules@vger.kernel.org L: linux-kernel@vger.kernel.org S: Maintained From 71bf395a276f0578d19e0ae137a7d1d816d08e0e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:50:58 -0700 Subject: [PATCH 057/352] KVM: x86: Enforce x2APIC's must-be-zero reserved ICR bits Inject a #GP on a WRMSR(ICR) that attempts to set any reserved bits that are must-be-zero on both Intel and AMD, i.e. any reserved bits other than the BUSY bit, which Intel ignores and basically says is undefined. 
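For illustration only (this is not the patch itself), the must-be-zero bits being enforced are ICR bits 31:20, 17:16 and 13, i.e. a mask of 0xFFF32000; a stand-alone toy version of the check applied to a WRMSR(ICR) value would be:

#include <stdint.h>
#include <stdio.h>

/* ICR bits 31:20, 17:16 and 13 are must-be-zero on both vendors. */
#define TOY_ICR_RESERVED_BITS 0xFFF32000ull

/* Returns nonzero if a WRMSR(ICR) of @data should be rejected with #GP. */
static int toy_icr_write_faults(uint64_t data)
{
	return (data & TOY_ICR_RESERVED_BITS) != 0;
}

int main(void)
{
	printf("%d\n", toy_icr_write_faults(0x00000001000000fdull)); /* 0: no reserved bits */
	printf("%d\n", toy_icr_write_faults(0x2000));                 /* 1: bit 13 is set */
	return 0;
}
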
KVM's xapic_state_test selftest has been fudging the bug since commit 4b88b1a518b3 ("KVM: selftests: Enhance handling WRMSR ICR register in x2APIC mode"), which essentially removed the testcase instead of fixing the bug. WARN if the nodecode path triggers a #GP, as the CPU is supposed to check reserved bits for ICR when it's partially virtualized. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240719235107.3023592-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d77cd3b87e850..c51c9bf6bd54e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2470,7 +2470,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) * maybe-unecessary write, and both are in the noise anyways. */ if (apic_x2apic_mode(apic) && offset == APIC_ICR) - kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)); + WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR))); else kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } @@ -3194,8 +3194,21 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) return 0; } +#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) + int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) { + if (data & X2APIC_ICR_RESERVED_BITS) + return 1; + + /* + * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but + * only AMD requires it to be zero, Intel essentially just ignores the + * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled, + * the CPU performs the reserved bits checks, i.e. the underlying CPU + * behavior will "win". Arbitrarily clear the BUSY bit, as there is no + * sane way to provide consistent behavior with respect to hardware. + */ data &= ~APIC_ICR_BUSY; kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); From d33234342f8b468e719e05649fd26549fb37ef8a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:50:59 -0700 Subject: [PATCH 058/352] KVM: x86: Move x2APIC ICR helper above kvm_apic_write_nodecode() Hoist kvm_x2apic_icr_write() above kvm_apic_write_nodecode() so that a local helper to _read_ the x2APIC ICR can be added and used in the nodecode path without needing a forward declaration. No functional change intended. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240719235107.3023592-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 46 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c51c9bf6bd54e..63be07d7c7820 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2453,6 +2453,29 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); +#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) + +int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) +{ + if (data & X2APIC_ICR_RESERVED_BITS) + return 1; + + /* + * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but + * only AMD requires it to be zero, Intel essentially just ignores the + * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled, + * the CPU performs the reserved bits checks, i.e. the underlying CPU + * behavior will "win". 
Arbitrarily clear the BUSY bit, as there is no + * sane way to provide consistent behavior with respect to hardware. + */ + data &= ~APIC_ICR_BUSY; + + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); + kvm_lapic_set_reg64(apic, APIC_ICR, data); + trace_kvm_apic_write(APIC_ICR, data); + return 0; +} + /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { @@ -3194,29 +3217,6 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) return 0; } -#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) - -int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) -{ - if (data & X2APIC_ICR_RESERVED_BITS) - return 1; - - /* - * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but - * only AMD requires it to be zero, Intel essentially just ignores the - * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled, - * the CPU performs the reserved bits checks, i.e. the underlying CPU - * behavior will "win". Arbitrarily clear the BUSY bit, as there is no - * sane way to provide consistent behavior with respect to hardware. - */ - data &= ~APIC_ICR_BUSY; - - kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); - kvm_lapic_set_reg64(apic, APIC_ICR, data); - trace_kvm_apic_write(APIC_ICR, data); - return 0; -} - static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) { u32 low; From 73b42dc69be8564d4951a14d00f827929fe5ef79 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:00 -0700 Subject: [PATCH 059/352] KVM: x86: Re-split x2APIC ICR into ICR+ICR2 for AMD (x2AVIC) Re-introduce the "split" x2APIC ICR storage that KVM used prior to Intel's IPI virtualization support, but only for AMD. While not stated anywhere in the APM, despite stating the ICR is a single 64-bit register, AMD CPUs store the 64-bit ICR as two separate 32-bit values in ICR and ICR2. When IPI virtualization (IPIv on Intel, all AVIC flavors on AMD) is enabled, KVM needs to match CPU behavior as some ICR ICR writes will be handled by the CPU, not by KVM. Add a kvm_x86_ops knob to control the underlying format used by the CPU to store the x2APIC ICR, and tune it to AMD vs. Intel regardless of whether or not x2AVIC is enabled. If KVM is handling all ICR writes, the storage format for x2APIC mode doesn't matter, and having the behavior follow AMD versus Intel will provide better test coverage and ease debugging. 
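To make the storage-format difference concrete, here is a stand-alone toy model (not the KVM code; the real accessors, keyed off the new x2apic_icr_is_split hook, are in the diff below):

#include <assert.h>
#include <stdint.h>

/*
 * Toy model of the two backing-storage layouts: the "split" layout keeps
 * the 64-bit ICR in two 32-bit slots (AMD), the other keeps it in a
 * single 64-bit slot (Intel).
 */
struct toy_apic {
	uint32_t icr, icr2;	/* split layout */
	uint64_t icr64;		/* single-register layout */
};

static void toy_icr_write(struct toy_apic *a, int split, uint64_t data)
{
	if (split) {
		a->icr  = (uint32_t)data;
		a->icr2 = (uint32_t)(data >> 32);
	} else {
		a->icr64 = data;
	}
}

static uint64_t toy_icr_read(const struct toy_apic *a, int split)
{
	if (split)
		return (uint64_t)a->icr | ((uint64_t)a->icr2 << 32);
	return a->icr64;
}

int main(void)
{
	struct toy_apic a = { 0 };
	uint64_t val = 0x12345678000000fdull;

	/* Whichever layout is in use, a write must read back unchanged. */
	toy_icr_write(&a, 1, val);
	assert(toy_icr_read(&a, 1) == val);
	toy_icr_write(&a, 0, val);
	assert(toy_icr_read(&a, 0) == val);
	return 0;
}
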
Fixes: 4d1d7942e36a ("KVM: SVM: Introduce logic to (de)activate x2AVIC mode") Cc: stable@vger.kernel.org Cc: Maxim Levitsky Cc: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20240719235107.3023592-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/lapic.c | 42 +++++++++++++++++++++++---------- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/main.c | 2 ++ 4 files changed, 36 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 95396e4cb3dad..f9dfb2d62053a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1727,6 +1727,8 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + + const bool x2apic_icr_is_split; const unsigned long required_apicv_inhibits; bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 63be07d7c7820..c7180cb5f4640 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2471,11 +2471,25 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) data &= ~APIC_ICR_BUSY; kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); - kvm_lapic_set_reg64(apic, APIC_ICR, data); + if (kvm_x86_ops.x2apic_icr_is_split) { + kvm_lapic_set_reg(apic, APIC_ICR, data); + kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); + } else { + kvm_lapic_set_reg64(apic, APIC_ICR, data); + } trace_kvm_apic_write(APIC_ICR, data); return 0; } +static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) +{ + if (kvm_x86_ops.x2apic_icr_is_split) + return (u64)kvm_lapic_get_reg(apic, APIC_ICR) | + (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32; + + return kvm_lapic_get_reg64(apic, APIC_ICR); +} + /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { @@ -2493,7 +2507,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) * maybe-unecessary write, and both are in the noise anyways. */ if (apic_x2apic_mode(apic) && offset == APIC_ICR) - WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR))); + WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic))); else kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } @@ -3013,18 +3027,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, /* * In x2APIC mode, the LDR is fixed and based on the id. And - * ICR is internally a single 64-bit register, but needs to be - * split to ICR+ICR2 in userspace for backwards compatibility. + * if the ICR is _not_ split, ICR is internally a single 64-bit + * register, but needs to be split to ICR+ICR2 in userspace for + * backwards compatibility. 
*/ - if (set) { + if (set) *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); - icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | - (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; - __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); - } else { - icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); - __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + if (!kvm_x86_ops.x2apic_icr_is_split) { + if (set) { + icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | + (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; + __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); + } else { + icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); + __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + } } } @@ -3222,7 +3240,7 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) u32 low; if (reg == APIC_ICR) { - *data = kvm_lapic_get_reg64(apic, APIC_ICR); + *data = kvm_x2apic_icr_read(apic); return 0; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d8cfe8f233276..eb3de01602b96 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5053,6 +5053,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, + + .x2apic_icr_is_split = true, .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .apicv_post_state_restore = avic_apicv_post_state_restore, diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 4f6023a0deb34..0a094ebad4b15 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -89,6 +89,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .enable_nmi_window = vmx_enable_nmi_window, .enable_irq_window = vmx_enable_irq_window, .update_cr8_intercept = vmx_update_cr8_intercept, + + .x2apic_icr_is_split = false, .set_virtual_apic_mode = vmx_set_virtual_apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, From d1c2cdca5a08f422b791670c11f9d4e3ed0a5518 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:01 -0700 Subject: [PATCH 060/352] KVM: selftests: Open code vcpu_run() equivalent in guest_printf test Open code a version of vcpu_run() in the guest_printf test in anticipation of adding UCALL_ABORT handling to _vcpu_run(). The guest_printf test intentionally generates asserts to verify the output, and thus needs to bypass common assert handling. Open code a helper in the guest_printf test, as it's not expected that any other test would want to skip _only_ the UCALL_ABORT handling. Link: https://lore.kernel.org/r/20240719235107.3023592-5-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/guest_print_test.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_print_test.c b/tools/testing/selftests/kvm/guest_print_test.c index 8092c2d0f5d68..bcf582852db99 100644 --- a/tools/testing/selftests/kvm/guest_print_test.c +++ b/tools/testing/selftests/kvm/guest_print_test.c @@ -107,6 +107,21 @@ static void ucall_abort(const char *assert_msg, const char *expected_assert_msg) expected_assert_msg, &assert_msg[offset]); } +/* + * Open code vcpu_run(), sans the UCALL_ABORT handling, so that intentional + * guest asserts guest can be verified instead of being reported as failures. 
+ */ +static void do_vcpu_run(struct kvm_vcpu *vcpu) +{ + int r; + + do { + r = __vcpu_run(vcpu); + } while (r == -1 && errno == EINTR); + + TEST_ASSERT(!r, KVM_IOCTL_ERROR(KVM_RUN, r)); +} + static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf, const char *expected_assert) { @@ -114,7 +129,7 @@ static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf, struct ucall uc; while (1) { - vcpu_run(vcpu); + do_vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, "Unexpected exit reason: %u (%s),", @@ -159,7 +174,7 @@ static void test_limits(void) vm = vm_create_with_one_vcpu(&vcpu, guest_code_limits); run = vcpu->run; - vcpu_run(vcpu); + do_vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, "Unexpected exit reason: %u (%s),", From ed24ba6c2c3450c96dcd804f81893d11e52463cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:02 -0700 Subject: [PATCH 061/352] KVM: selftests: Report unhandled exceptions on x86 as regular guest asserts Now that selftests support printf() in the guest, report unexpected exceptions via the regular assertion framework. Exceptions were special cased purely to provide a better error message. Convert only x86 for now, as it's low-hanging fruit (already formats the assertion in the guest), and converting x86 will allow adding asserts in x86 library code without needing to update multiple tests. Once all other architectures are converted, this will allow moving the reporting to common code, which will in turn allow adding asserts in common library code, and will also allow removing UCALL_UNHANDLED. Link: https://lore.kernel.org/r/20240719235107.3023592-6-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 153739f2e201f..814a604c0891b 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -566,10 +566,8 @@ void route_exception(struct ex_regs *regs) if (kvm_fixup_exception(regs)) return; - ucall_assert(UCALL_UNHANDLED, - "Unhandled exception in guest", __FILE__, __LINE__, - "Unhandled exception '0x%lx' at guest RIP '0x%lx'", - regs->vector, regs->rip); + GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'", + regs->vector, regs->rip); } static void vm_init_descriptor_tables(struct kvm_vm *vm) @@ -611,7 +609,7 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) { struct ucall uc; - if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) + if (get_ucall(vcpu, &uc) == UCALL_ABORT) REPORT_GUEST_ASSERT(uc); } From f2e91e874179a27d4c29b3f31706b37e1e6bcf54 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:03 -0700 Subject: [PATCH 062/352] KVM: selftests: Add x86 helpers to play nice with x2APIC MSR #GPs Add helpers to allow and expect #GP on x2APIC MSRs, and opportunistically have the existing helper spit out a more useful error message if an unexpected exception occurs. 
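As a usage sketch (not part of this patch, and assuming the usual selftest guest environment with guest_code() wired up via vm_create_with_one_vcpu() as in other tests), guest code can now express both expectations directly:

#include "apic.h"
#include "ucall_common.h"

static void guest_code(void)
{
	x2apic_enable();

	/* Bit 13 of the ICR is reserved; the write is expected to #GP. */
	x2apic_write_reg_fault(APIC_ICR, 1ULL << 13);

	/* Reads remain plain MSR reads and are reported to the host. */
	GUEST_SYNC(x2apic_read_reg(APIC_ICR));

	GUEST_DONE();
}
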
Link: https://lore.kernel.org/r/20240719235107.3023592-7-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86_64/apic.h | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h index 0f268b55fa067..51990094effda 100644 --- a/tools/testing/selftests/kvm/include/x86_64/apic.h +++ b/tools/testing/selftests/kvm/include/x86_64/apic.h @@ -11,6 +11,7 @@ #include #include "processor.h" +#include "ucall_common.h" #define APIC_DEFAULT_GPA 0xfee00000ULL @@ -93,9 +94,27 @@ static inline uint64_t x2apic_read_reg(unsigned int reg) return rdmsr(APIC_BASE_MSR + (reg >> 4)); } +static inline uint8_t x2apic_write_reg_safe(unsigned int reg, uint64_t value) +{ + return wrmsr_safe(APIC_BASE_MSR + (reg >> 4), value); +} + static inline void x2apic_write_reg(unsigned int reg, uint64_t value) { - wrmsr(APIC_BASE_MSR + (reg >> 4), value); + uint8_t fault = x2apic_write_reg_safe(reg, value); + + __GUEST_ASSERT(!fault, "Unexpected fault 0x%x on WRMSR(%x) = %lx\n", + fault, APIC_BASE_MSR + (reg >> 4), value); } +static inline void x2apic_write_reg_fault(unsigned int reg, uint64_t value) +{ + uint8_t fault = x2apic_write_reg_safe(reg, value); + + __GUEST_ASSERT(fault == GP_VECTOR, + "Wanted #GP on WRMSR(%x) = %lx, got 0x%x\n", + APIC_BASE_MSR + (reg >> 4), value, fault); +} + + #endif /* SELFTEST_KVM_APIC_H */ From faf06a238254cec0c8c9bc0876caede63fcfeb24 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:04 -0700 Subject: [PATCH 063/352] KVM: selftests: Skip ICR.BUSY test in xapic_state_test if x2APIC is enabled Don't test the ICR BUSY bit when x2APIC is enabled as AMD and Intel have different behavior (AMD #GPs, Intel ignores), and the fact that the CPU performs the reserved bit checks when IPI virtualization is enabled makes it impossible for KVM to precisely emulate one or the other. Link: https://lore.kernel.org/r/20240719235107.3023592-8-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/xapic_state_test.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 618cd24423900..d5a7adaa95026 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -70,12 +70,10 @@ static void ____test_icr(struct xapic_vcpu *x, uint64_t val) vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; - if (!x->is_x2apic) { + if (!x->is_x2apic) val &= (-1u | (0xffull << (32 + 24))); - TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); - } else { - TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); - } + + TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); } #define X2APIC_RSVED_BITS_MASK (GENMASK_ULL(31,20) | \ @@ -91,7 +89,15 @@ static void __test_icr(struct xapic_vcpu *x, uint64_t val) */ val &= ~X2APIC_RSVED_BITS_MASK; } - ____test_icr(x, val | APIC_ICR_BUSY); + + /* + * The BUSY bit is reserved on both AMD and Intel, but only AMD treats + * it is as _must_ be zero. Intel simply ignores the bit. Don't test + * the BUSY bit for x2APIC, as there is no single correct behavior. 
+ */ + if (!x->is_x2apic) + ____test_icr(x, val | APIC_ICR_BUSY); + ____test_icr(x, val & ~(u64)APIC_ICR_BUSY); } From 3426cb48adb4dc00b75e89c95d257d699f4d75ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:05 -0700 Subject: [PATCH 064/352] KVM: selftests: Test x2APIC ICR reserved bits Actually test x2APIC ICR reserved bits instead of deliberately skipping them. The behavior that is observed when IPI virtualization is enabled is the architecturally correct behavior, KVM is the one who was wrong, i.e. KVM was missing reserved bit checks. Fixes: 4b88b1a518b3 ("KVM: selftests: Enhance handling WRMSR ICR register in x2APIC mode") Link: https://lore.kernel.org/r/20240719235107.3023592-9-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/xapic_state_test.c | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index d5a7adaa95026..a17b75fb25064 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -31,6 +31,10 @@ static void xapic_guest_code(void) } } +#define X2APIC_RSVD_BITS_MASK (GENMASK_ULL(31, 20) | \ + GENMASK_ULL(17, 16) | \ + GENMASK_ULL(13, 13)) + static void x2apic_guest_code(void) { asm volatile("cli"); @@ -41,7 +45,10 @@ static void x2apic_guest_code(void) uint64_t val = x2apic_read_reg(APIC_IRR) | x2apic_read_reg(APIC_IRR + 0x10) << 32; - x2apic_write_reg(APIC_ICR, val); + if (val & X2APIC_RSVD_BITS_MASK) + x2apic_write_reg_fault(APIC_ICR, val); + else + x2apic_write_reg(APIC_ICR, val); GUEST_SYNC(val); } while (1); } @@ -72,24 +79,14 @@ static void ____test_icr(struct xapic_vcpu *x, uint64_t val) (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; if (!x->is_x2apic) val &= (-1u | (0xffull << (32 + 24))); + else if (val & X2APIC_RSVD_BITS_MASK) + return; TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); } -#define X2APIC_RSVED_BITS_MASK (GENMASK_ULL(31,20) | \ - GENMASK_ULL(17,16) | \ - GENMASK_ULL(13,13)) - static void __test_icr(struct xapic_vcpu *x, uint64_t val) { - if (x->is_x2apic) { - /* Hardware writing vICR register requires reserved bits 31:20, - * 17:16 and 13 kept as zero to avoid #GP exception. Data value - * written to vICR should mask out those bits above. - */ - val &= ~X2APIC_RSVED_BITS_MASK; - } - /* * The BUSY bit is reserved on both AMD and Intel, but only AMD treats * it is as _must_ be zero. Intel simply ignores the bit. Don't test From 0cb26ec320851f685280ff061f84855d0e97bf86 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:06 -0700 Subject: [PATCH 065/352] KVM: selftests: Verify the guest can read back the x2APIC ICR it wrote Now that the BUSY bit mess is gone (for x2APIC), verify that the *guest* can read back the ICR value that it wrote. Due to the divergent behavior between AMD and Intel with respect to the backing storage of the ICR in the vAPIC page, emulating a seemingly simple MSR write is quite complex. 
Link: https://lore.kernel.org/r/20240719235107.3023592-10-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/xapic_state_test.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index a17b75fb25064..dbd4f23ce92e8 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -45,10 +45,12 @@ static void x2apic_guest_code(void) uint64_t val = x2apic_read_reg(APIC_IRR) | x2apic_read_reg(APIC_IRR + 0x10) << 32; - if (val & X2APIC_RSVD_BITS_MASK) + if (val & X2APIC_RSVD_BITS_MASK) { x2apic_write_reg_fault(APIC_ICR, val); - else + } else { x2apic_write_reg(APIC_ICR, val); + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ICR), val); + } GUEST_SYNC(val); } while (1); } From 5a7c7d148e488f43cf9c8e64fa5e1bd715ae0485 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:07 -0700 Subject: [PATCH 066/352] KVM: selftests: Play nice with AMD's AVIC errata When AVIC, and thus IPI virtualization on AMD, is enabled, the CPU will virtualize ICR writes. Unfortunately, the CPU doesn't do a very good job, as it fails to clear the BUSY bit and also allows writing ICR2[23:0], despite them being "RESERVED MBZ". Account for the quirky behavior in the xapic_state test to avoid failures in a configuration that likely has no hope of ever being enabled in production. Link: https://lore.kernel.org/r/20240719235107.3023592-11-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/xapic_state_test.c | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index dbd4f23ce92e8..88bcca188799b 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -13,6 +13,7 @@ struct xapic_vcpu { struct kvm_vcpu *vcpu; bool is_x2apic; + bool has_xavic_errata; }; static void xapic_guest_code(void) @@ -79,12 +80,17 @@ static void ____test_icr(struct xapic_vcpu *x, uint64_t val) vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; - if (!x->is_x2apic) - val &= (-1u | (0xffull << (32 + 24))); - else if (val & X2APIC_RSVD_BITS_MASK) + if (!x->is_x2apic) { + if (!x->has_xavic_errata) + val &= (-1u | (0xffull << (32 + 24))); + } else if (val & X2APIC_RSVD_BITS_MASK) { return; + } - TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); + if (x->has_xavic_errata) + TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); + else + TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); } static void __test_icr(struct xapic_vcpu *x, uint64_t val) @@ -236,6 +242,15 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); x.is_x2apic = false; + /* + * AMD's AVIC implementation is buggy (fails to clear the ICR BUSY bit), + * and also diverges from KVM with respect to ICR2[23:0] (KVM and Intel + * drops writes, AMD does not). Account for the errata when checking + * that KVM reads back what was written. 
+ */ + x.has_xavic_errata = host_cpu_is_amd && + get_kvm_amd_param_bool("avic"); + vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); From ce3b90bd0a165c0473e462b9182e30e0659f99cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:08:53 -0700 Subject: [PATCH 067/352] KVM: selftests: Remove unused kvm_memcmp_hva_gva() Remove sefltests' kvm_memcmp_hva_gva(), which has literally never had a single user since it was introduced by commit 783e9e51266eb ("kvm: selftests: add API testing infrastructure"). Link: https://lore.kernel.org/r/20240802200853.336512-1-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/kvm_util.h | 2 - tools/testing/selftests/kvm/lib/kvm_util.c | 70 ------------------- 2 files changed, 72 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 63c2aaae51f30..acd2db809e833 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -428,8 +428,6 @@ const char *vm_guest_mode_string(uint32_t i); void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_restart(struct kvm_vm *vmp); void kvm_vm_release(struct kvm_vm *vmp); -int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, - size_t len); void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); int kvm_memfd_alloc(size_t size, bool hugepages); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 56b170b725b32..f7b7185dff106 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -794,76 +794,6 @@ int kvm_memfd_alloc(size_t size, bool hugepages) return fd; } -/* - * Memory Compare, host virtual to guest virtual - * - * Input Args: - * hva - Starting host virtual address - * vm - Virtual Machine - * gva - Starting guest virtual address - * len - number of bytes to compare - * - * Output Args: None - * - * Input/Output Args: None - * - * Return: - * Returns 0 if the bytes starting at hva for a length of len - * are equal the guest virtual bytes starting at gva. Returns - * a value < 0, if bytes at hva are less than those at gva. - * Otherwise a value > 0 is returned. - * - * Compares the bytes starting at the host virtual address hva, for - * a length of len, to the guest bytes starting at the guest virtual - * address given by gva. - */ -int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) -{ - size_t amt; - - /* - * Compare a batch of bytes until either a match is found - * or all the bytes have been compared. - */ - for (uintptr_t offset = 0; offset < len; offset += amt) { - uintptr_t ptr1 = (uintptr_t)hva + offset; - - /* - * Determine host address for guest virtual address - * at offset. - */ - uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset); - - /* - * Determine amount to compare on this pass. - * Don't allow the comparsion to cross a page boundary. - */ - amt = len - offset; - if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift)) - amt = vm->page_size - (ptr1 % vm->page_size); - if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift)) - amt = vm->page_size - (ptr2 % vm->page_size); - - assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift)); - assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift)); - - /* - * Perform the comparison. 
If there is a difference - * return that result to the caller, otherwise need - * to continue on looking for a mismatch. - */ - int ret = memcmp((void *)ptr1, (void *)ptr2, amt); - if (ret != 0) - return ret; - } - - /* - * No mismatch found. Let the caller know the two memory - * areas are equal. - */ - return 0; -} - static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, struct userspace_mem_region *region) { From c0d1a39d1d20e5e770bad72bbe1e9d4fa1367e28 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:14:29 -0700 Subject: [PATCH 068/352] KVM: selftests: Always unlink memory regions when deleting (VM free) Unlink memory regions when freeing a VM, even though it's not strictly necessary since all tracking structures are freed soon after. The time spent deleting entries is negligible, and not unlinking entries is confusing, e.g. it's easy to overlook that the tree structures are freed by the caller. Link: https://lore.kernel.org/r/20240802201429.338412-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/kvm_util.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f7b7185dff106..a2b7df5f1d393 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -712,16 +712,13 @@ void kvm_vm_release(struct kvm_vm *vmp) } static void __vm_mem_region_delete(struct kvm_vm *vm, - struct userspace_mem_region *region, - bool unlink) + struct userspace_mem_region *region) { int ret; - if (unlink) { - rb_erase(®ion->gpa_node, &vm->regions.gpa_tree); - rb_erase(®ion->hva_node, &vm->regions.hva_tree); - hash_del(®ion->slot_node); - } + rb_erase(®ion->gpa_node, &vm->regions.gpa_tree); + rb_erase(®ion->hva_node, &vm->regions.hva_tree); + hash_del(®ion->slot_node); region->region.memory_size = 0; vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); @@ -762,7 +759,7 @@ void kvm_vm_free(struct kvm_vm *vmp) /* Free userspace_mem_regions. */ hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) - __vm_mem_region_delete(vmp, region, false); + __vm_mem_region_delete(vmp, region); /* Free sparsebit arrays. */ sparsebit_free(&vmp->vpages_valid); @@ -1200,7 +1197,7 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) */ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) { - __vm_mem_region_delete(vm, memslot2region(vm, slot), true); + __vm_mem_region_delete(vm, memslot2region(vm, slot)); } void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size, From 174b6e4a25ea80c2432cedd8e2760e152a6d7f82 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:38:58 -0700 Subject: [PATCH 069/352] KVM: x86/mmu: Decrease indentation in logic to sync new indirect shadow page Combine the back-to-back if-statements for synchronizing children when linking a new indirect shadow page in order to decrease the indentation, and to make it easier to "see" the logic in its entirety. No functional change intended. 
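For readers skimming the hunk below, the net effect is simply folding the outer check into the existing condition; this is a paraphrase of the diff, not new logic:

	/* Before: the sync check sat one indentation level deeper. */
	if (sp != ERR_PTR(-EEXIST)) {
		if (sp->unsync_children && mmu_sync_children(vcpu, sp, false))
			return RET_PF_RETRY;
	}

	/* After: one combined condition, identical behavior. */
	if (sp != ERR_PTR(-EEXIST) && sp->unsync_children &&
	    mmu_sync_children(vcpu, sp, false))
		return RET_PF_RETRY;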
Link: https://lore.kernel.org/r/20240802203900.348808-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/paging_tmpl.h | 40 ++++++++++++++++------------------ 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 69941cebb3a87..0e97e080a997e 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -674,27 +674,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, false, access); - if (sp != ERR_PTR(-EEXIST)) { - /* - * We must synchronize the pagetable before linking it - * because the guest doesn't need to flush tlb when - * the gpte is changed from non-present to present. - * Otherwise, the guest may use the wrong mapping. - * - * For PG_LEVEL_4K, kvm_mmu_get_page() has already - * synchronized it transiently via kvm_sync_page(). - * - * For higher level pagetable, we synchronize it via - * the slower mmu_sync_children(). If it needs to - * break, some progress has been made; return - * RET_PF_RETRY and retry on the next #PF. - * KVM_REQ_MMU_SYNC is not necessary but it - * expedites the process. - */ - if (sp->unsync_children && - mmu_sync_children(vcpu, sp, false)) - return RET_PF_RETRY; - } + /* + * Synchronize the new page before linking it, as the CPU (KVM) + * is architecturally disallowed from inserting non-present + * entries into the TLB, i.e. the guest isn't required to flush + * the TLB when changing the gPTE from non-present to present. + * + * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already + * synchronized the page via kvm_sync_page(). + * + * For higher level pages, which cannot be unsync themselves + * but can have unsync children, synchronize via the slower + * mmu_sync_children(). If KVM needs to drop mmu_lock due to + * contention or to reschedule, instruct the caller to retry + * the #PF (mmu_sync_children() ensures forward progress will + * be made). + */ + if (sp != ERR_PTR(-EEXIST) && sp->unsync_children && + mmu_sync_children(vcpu, sp, false)) + return RET_PF_RETRY; /* * Verify that the gpte in the page we've just write From 7d67b03e6fff3934913a38674f269f55964bdd65 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:38:59 -0700 Subject: [PATCH 070/352] KVM: x86/mmu: Drop pointless "return" wrapper label in FNAME(fetch) Drop the pointless and poorly named "out_gpte_changed" label, in FNAME(fetch), and instead return RET_PF_RETRY directly. No functional change intended. Link: https://lore.kernel.org/r/20240802203900.348808-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/paging_tmpl.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 0e97e080a997e..480c541229917 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -646,10 +646,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * really care if it changes underneath us after this point). 
*/ if (FNAME(gpte_changed)(vcpu, gw, top_level)) - goto out_gpte_changed; + return RET_PF_RETRY; if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) - goto out_gpte_changed; + return RET_PF_RETRY; /* * Load a new root and retry the faulting instruction in the extremely @@ -659,7 +659,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, */ if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); - goto out_gpte_changed; + return RET_PF_RETRY; } for_each_shadow_entry(vcpu, fault->addr, it) { @@ -699,7 +699,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * protected is still there. */ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) - goto out_gpte_changed; + return RET_PF_RETRY; if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); @@ -753,9 +753,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, FNAME(pte_prefetch)(vcpu, gw, it.sptep); return ret; - -out_gpte_changed: - return RET_PF_RETRY; } /* From 1dc9cc1c4c230fbc6bf6322f150d4b81f712cfb9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:39:00 -0700 Subject: [PATCH 071/352] KVM: x86/mmu: Reword a misleading comment about checking gpte_changed() Rewrite the comment in FNAME(fetch) to explain why KVM needs to check that the gPTE is still fresh before continuing the shadow page walk, even if KVM already has a linked shadow page for the gPTE in question. No functional change intended. Link: https://lore.kernel.org/r/20240802203900.348808-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/paging_tmpl.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 480c541229917..405bd7ceee2a3 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -695,8 +695,14 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, return RET_PF_RETRY; /* - * Verify that the gpte in the page we've just write - * protected is still there. + * Verify that the gpte in the page, which is now either + * write-protected or unsync, wasn't modified between the fault + * and acquiring mmu_lock. This needs to be done even when + * reusing an existing shadow page to ensure the information + * gathered by the walker matches the information stored in the + * shadow page (which could have been modified by a different + * vCPU even if the page was already linked). Holding mmu_lock + * prevents the shadow page from changing after this point. */ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) return RET_PF_RETRY; From 48547fe75ea7d5bf1ff9425a0a5d4d32b3a77777 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:45:09 -0700 Subject: [PATCH 072/352] KVM: SVM: Add a helper to convert a SME-aware PA back to a struct page Add __sme_pa_to_page() to pair with __sme_page_pa() and use it to replace open coded equivalents, including for "iopm_base", which previously avoided having to do __sme_clr() by storing the raw PA in the global variable. Opportunistically convert __sme_page_pa() to a helper to provide type safety. No functional change intended. 
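As a rough illustration of the intended round-trip (a sketch built on the helpers added below; __sme_set() and __sme_clr() respectively set and clear the SME C-bit in a physical address, so the two helpers are exact inverses):

	struct page *page = alloc_page(GFP_KERNEL);	/* illustration only */
	unsigned long pa = __sme_page_pa(page);		/* page => C-bit-tagged PA */
	struct page *p = __sme_pa_to_page(pa);		/* tagged PA => page */
	/* p == page; callers such as iopm_base can store the tagged PA directly
	 * and convert back without open-coding __sme_clr() and pfn arithmetic. */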
Link: https://lore.kernel.org/r/20240802204511.352017-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 9 ++++----- arch/x86/kvm/svm/svm.h | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d6f252555ab3f..dd1cfee3e38fc 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1124,8 +1124,7 @@ static void svm_hardware_unsetup(void) for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), - get_order(IOPM_SIZE)); + __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE)); iopm_base = 0; } @@ -1301,7 +1300,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) if (!kvm_hlt_in_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_HLT); - control->iopm_base_pa = __sme_set(iopm_base); + control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); control->int_ctl = V_INTR_MASKING_MASK; @@ -1503,7 +1502,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) sev_free_vcpu(vcpu); - __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT)); + __free_page(__sme_pa_to_page(svm->vmcb01.pa)); __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } @@ -5251,7 +5250,7 @@ static __init int svm_hardware_setup(void) iopm_va = page_address(iopm_pages); memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + iopm_base = __sme_page_pa(iopm_pages); init_msrpm_offsets(); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 76107c7d0595d..2b095acdb97f1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -25,7 +25,21 @@ #include "cpuid.h" #include "kvm_cache_regs.h" -#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) +/* + * Helpers to convert to/from physical addresses for pages whose address is + * consumed directly by hardware. Even though it's a physical address, SVM + * often restricts the address to the natural width, hence 'unsigned long' + * instead of 'hpa_t'. + */ +static inline unsigned long __sme_page_pa(struct page *page) +{ + return __sme_set(page_to_pfn(page) << PAGE_SHIFT); +} + +static inline struct page *__sme_pa_to_page(unsigned long pa) +{ + return pfn_to_page(__sme_clr(pa) >> PAGE_SHIFT); +} #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 From 1b5ef14dc656a25280d56795b73cf90dad64ad44 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:45:10 -0700 Subject: [PATCH 073/352] KVM: SVM: Add host SEV-ES save area structure into VMCB via a union Incorporate the _host_ SEV-ES save area into the VMCB as a union with the legacy save area. The SEV-ES variant used to save/load host state is larger than the legacy save area, but resides at the same offset. Prefix the field with "host" to make it as obvious as possible that the SEV-ES variant in the VMCB is only ever used for host state. Guest state for SEV-ES VMs is stored in a completely separate page (VMSA), albeit with the same layout as the host state. Add a compile-time assert to ensure the VMCB layout is correct, i.e. that KVM's layout matches the architectural definitions. No functional change intended. 
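Conceptually, the layout after this patch looks as follows (an annotated sketch, not code from the diff; the separate VMSA page is KVM's existing per-vCPU state and is mentioned only for orientation):

	vmcb->save			/* legacy save area, non-SEV-ES guests        */
	vmcb->host_sev_es_save		/* same offset, used only to save/load *host*
					 * state on SEV-ES-capable systems           */
	/* SEV-ES *guest* state never lives in the VMCB; it resides in a separate,
	 * guest-key-encrypted VMSA page. */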
Link: https://lore.kernel.org/r/20240802204511.352017-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f0dea3750ca9f..2b59b9951c90e 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -516,6 +516,20 @@ struct ghcb { u32 ghcb_usage; } __packed; +struct vmcb { + struct vmcb_control_area control; + union { + struct vmcb_save_area save; + + /* + * For SEV-ES VMs, the save area in the VMCB is used only to + * save/load host state. Guest state resides in a separate + * page, the aptly named VM Save Area (VMSA), that is encrypted + * with the guest's private key. + */ + struct sev_es_save_area host_sev_es_save; + }; +} __packed; #define EXPECTED_VMCB_SAVE_AREA_SIZE 744 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 @@ -532,6 +546,7 @@ static inline void __unused_size_checks(void) BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); + BUILD_BUG_ON(offsetof(struct vmcb, save) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); /* Check offsets of reserved fields */ @@ -568,11 +583,6 @@ static inline void __unused_size_checks(void) BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0); } -struct vmcb { - struct vmcb_control_area control; - struct vmcb_save_area save; -} __packed; - #define SVM_CPUID_FUNC 0x8000000a #define SVM_SELECTOR_S_SHIFT 4 From 32071fa355e73495e509a28511a81b4baab51ff6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:45:11 -0700 Subject: [PATCH 074/352] KVM: SVM: Track the per-CPU host save area as a VMCB pointer The host save area is a VMCB, track it as such to help readers follow along, but mostly to cleanup/simplify the retrieval of the SEV-ES host save area. Note, the compile-time assertion that offsetof(struct vmcb, save) == EXPECTED_VMCB_CONTROL_AREA_SIZE ensures that the SEV-ES save area is indeed at offset 0x400 (whoever added the expected/architectural VMCB offsets apparently likes decimal). No functional change intended. 
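Put differently (assuming EXPECTED_VMCB_CONTROL_AREA_SIZE is 1024, i.e. 0x400; the define itself sits outside the hunks shown here):

	BUILD_BUG_ON(offsetof(struct vmcb, save) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
	/* 1024 == 0x400, so &sd->save_area->host_sev_es_save resolves to the same
	 * address as the old open-coded "page_address(sd->save_area) + 0x400". */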
Link: https://lore.kernel.org/r/20240802204511.352017-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 15 ++++++++------- arch/x86/kvm/svm/svm.h | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dd1cfee3e38fc..cac9e36960cad 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -573,7 +573,7 @@ static void __svm_write_tsc_multiplier(u64 multiplier) static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) { - return page_address(sd->save_area) + 0x400; + return &sd->save_area->host_sev_es_save; } static inline void kvm_cpu_svm_disable(void) @@ -696,7 +696,7 @@ static void svm_cpu_uninit(int cpu) return; kfree(sd->sev_vmcbs); - __free_page(sd->save_area); + __free_page(__sme_pa_to_page(sd->save_area_pa)); sd->save_area_pa = 0; sd->save_area = NULL; } @@ -704,23 +704,24 @@ static void svm_cpu_uninit(int cpu) static int svm_cpu_init(int cpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); + struct page *save_area_page; int ret = -ENOMEM; memset(sd, 0, sizeof(struct svm_cpu_data)); - sd->save_area = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); - if (!sd->save_area) + save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); + if (!save_area_page) return ret; ret = sev_cpu_init(sd); if (ret) goto free_save_area; - sd->save_area_pa = __sme_page_pa(sd->save_area); + sd->save_area = page_address(save_area_page); + sd->save_area_pa = __sme_page_pa(save_area_page); return 0; free_save_area: - __free_page(sd->save_area); - sd->save_area = NULL; + __free_page(save_area_page); return ret; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 2b095acdb97f1..43fa6a16eb191 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -335,7 +335,7 @@ struct svm_cpu_data { u32 next_asid; u32 min_asid; - struct page *save_area; + struct vmcb *save_area; unsigned long save_area_pa; struct vmcb *current_vmcb; From 2f6fcfa1f4264c1f035ddd092ebd046499f7cbea Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Tue, 9 Jul 2024 11:29:36 -0700 Subject: [PATCH 075/352] KVM: selftests: Add SEV-ES shutdown test Regression test for ae20eef5 ("KVM: SVM: Update SEV-ES shutdown intercepts with more metadata"). Test confirms userspace is correctly indicated of a guest shutdown not previous behavior of an EINVAL from KVM_RUN. Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Alper Gun Cc: Tom Lendacky Cc: Michael Roth Cc: kvm@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Signed-off-by: Peter Gonda Tested-by: Pratik R. Sampat Link: https://lore.kernel.org/r/20240709182936.146487-1-pgonda@google.com [sean: clobber IDT to ensure #UD leads to SHUTDOWN] Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/sev_smoke_test.c | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c index 7c70c0da4fb74..2e9197eb16525 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c @@ -160,6 +160,36 @@ static void test_sev(void *guest_code, uint64_t policy) kvm_vm_free(vm); } +static void guest_shutdown_code(void) +{ + struct desc_ptr idt; + + /* Clobber the IDT so that #UD is guaranteed to trigger SHUTDOWN. 
*/ + memset(&idt, 0, sizeof(idt)); + __asm__ __volatile__("lidt %0" :: "m"(idt)); + + __asm__ __volatile__("ud2"); +} + +static void test_sev_es_shutdown(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + uint32_t type = KVM_X86_SEV_ES_VM; + + vm = vm_sev_create_with_one_vcpu(type, guest_shutdown_code, &vcpu); + + vm_sev_launch(vm, SEV_POLICY_ES, NULL); + + vcpu_run(vcpu); + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SHUTDOWN, + "Wanted SHUTDOWN, got %s", + exit_reason_str(vcpu->run->exit_reason)); + + kvm_vm_free(vm); +} + int main(int argc, char *argv[]) { TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); @@ -171,6 +201,8 @@ int main(int argc, char *argv[]) test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); test_sev(guest_sev_es_code, SEV_POLICY_ES); + test_sev_es_shutdown(); + if (kvm_has_cap(KVM_CAP_XCRS) && (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) { test_sync_vmsa(0); From 215b3cb7a84f8d97b81fe8536cec05a11496da51 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 28 Aug 2024 11:14:45 -0700 Subject: [PATCH 076/352] KVM: selftests: Add a test for coalesced MMIO (and PIO on x86) Add a test to verify that KVM correctly exits (or not) when a vCPU's coalesced I/O ring is full (or isn't). Iterate over all legal starting points in the ring (with an empty ring), and verify that KVM doesn't exit until the ring is full. Opportunistically verify that KVM exits immediately on non-coalesced I/O, either because the MMIO/PIO region was never registered, or because a previous region was unregistered. This is a regression test for a KVM bug where KVM would prematurely exit due to bad math resulting in a false positive if the first entry in the ring was before the halfway mark. See commit 92f6d4130497 ("KVM: Fix coalesced_mmio_has_room() to avoid premature userspace exit"). Enable the test for x86, arm64, and risc-v, i.e. all architectures except s390, which doesn't have MMIO. On x86, which has both MMIO and PIO, interleave MMIO and PIO into the same ring, as KVM shouldn't exit until a non-coalesced I/O is encountered, regardless of whether the ring is filled with MMIO, PIO, or both. Lastly, wrap the coalesced I/O ring in a structure to prepare for a potential future where KVM supports multiple ring buffers beyond KVM's "default" built-in buffer. 
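A minimal sketch of the occupancy rule the test is built around: KVM keeps one slot free so that an empty ring and a full ring are distinguishable, and the same check appears in the follow-up rework of coalesced_mmio_write():

	/* The ring is full once the producer would catch up with the consumer: */
	bool ring_full = ((ring->last + 1) % KVM_COALESCED_MMIO_MAX) == ring->first;
	/* => the guest can coalesce at most ring_size - 1 I/Os before KVM must
	 *    exit to userspace, regardless of which index 'first' starts at. */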
Link: https://lore.kernel.org/all/20240820133333.1724191-1-ilstam@amazon.com Cc: Ilias Stamatis Cc: Marc Zyngier Cc: Oliver Upton Cc: Anup Patel Link: https://lore.kernel.org/r/20240828181446.652474-2-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile | 3 + .../testing/selftests/kvm/coalesced_io_test.c | 236 ++++++++++++++++++ .../testing/selftests/kvm/include/kvm_util.h | 26 ++ 3 files changed, 265 insertions(+) create mode 100644 tools/testing/selftests/kvm/coalesced_io_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 48d32c5aa3eb7..45cb70c048bb7 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -130,6 +130,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test TEST_GEN_PROGS_x86_64 += access_tracking_perf_test +TEST_GEN_PROGS_x86_64 += coalesced_io_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test @@ -165,6 +166,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/vgic_lpi_stress TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access TEST_GEN_PROGS_aarch64 += access_tracking_perf_test TEST_GEN_PROGS_aarch64 += arch_timer +TEST_GEN_PROGS_aarch64 += coalesced_io_test TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += dirty_log_perf_test @@ -198,6 +200,7 @@ TEST_GEN_PROGS_s390x += kvm_binary_stats_test TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test TEST_GEN_PROGS_riscv += riscv/ebreak_test TEST_GEN_PROGS_riscv += arch_timer +TEST_GEN_PROGS_riscv += coalesced_io_test TEST_GEN_PROGS_riscv += demand_paging_test TEST_GEN_PROGS_riscv += dirty_log_test TEST_GEN_PROGS_riscv += get-reg-list diff --git a/tools/testing/selftests/kvm/coalesced_io_test.c b/tools/testing/selftests/kvm/coalesced_io_test.c new file mode 100644 index 0000000000000..60cb254548998 --- /dev/null +++ b/tools/testing/selftests/kvm/coalesced_io_test.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "ucall_common.h" + +struct kvm_coalesced_io { + struct kvm_coalesced_mmio_ring *ring; + uint32_t ring_size; + uint64_t mmio_gpa; + uint64_t *mmio; + + /* + * x86-only, but define pio_port for all architectures to minimize the + * amount of #ifdeffery and complexity, without having to sacrifice + * verbose error messages. + */ + uint8_t pio_port; +}; + +static struct kvm_coalesced_io kvm_builtin_io_ring; + +#ifdef __x86_64__ +static const int has_pio = 1; +#else +static const int has_pio = 0; +#endif + +static void guest_code(struct kvm_coalesced_io *io) +{ + int i, j; + + for (;;) { + for (j = 0; j < 1 + has_pio; j++) { + /* + * KVM always leaves one free entry, i.e. exits to + * userspace before the last entry is filled. 
+ */ + for (i = 0; i < io->ring_size - 1; i++) { +#ifdef __x86_64__ + if (i & 1) + outl(io->pio_port, io->pio_port + i); + else +#endif + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); + } +#ifdef __x86_64__ + if (j & 1) + outl(io->pio_port, io->pio_port + i); + else +#endif + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); + } + GUEST_SYNC(0); + + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); +#ifdef __x86_64__ + outl(io->pio_port, io->pio_port + i); +#endif + } +} + +static void vcpu_run_and_verify_io_exit(struct kvm_vcpu *vcpu, + struct kvm_coalesced_io *io, + uint32_t ring_start, + uint32_t expected_exit) +{ + const bool want_pio = expected_exit == KVM_EXIT_IO; + struct kvm_coalesced_mmio_ring *ring = io->ring; + struct kvm_run *run = vcpu->run; + uint32_t pio_value; + + WRITE_ONCE(ring->first, ring_start); + WRITE_ONCE(ring->last, ring_start); + + vcpu_run(vcpu); + + /* + * Annoyingly, reading PIO data is safe only for PIO exits, otherwise + * data_offset is garbage, e.g. an MMIO gpa. + */ + if (run->exit_reason == KVM_EXIT_IO) + pio_value = *(uint32_t *)((void *)run + run->io.data_offset); + else + pio_value = 0; + + TEST_ASSERT((!want_pio && (run->exit_reason == KVM_EXIT_MMIO && run->mmio.is_write && + run->mmio.phys_addr == io->mmio_gpa && run->mmio.len == 8 && + *(uint64_t *)run->mmio.data == io->mmio_gpa + io->ring_size - 1)) || + (want_pio && (run->exit_reason == KVM_EXIT_IO && run->io.port == io->pio_port && + run->io.direction == KVM_EXIT_IO_OUT && run->io.count == 1 && + pio_value == io->pio_port + io->ring_size - 1)), + "For start = %u, expected exit on %u-byte %s write 0x%llx = %lx, got exit_reason = %u (%s)\n " + "(MMIO addr = 0x%llx, write = %u, len = %u, data = %lx)\n " + "(PIO port = 0x%x, write = %u, len = %u, count = %u, data = %x", + ring_start, want_pio ? 4 : 8, want_pio ? "PIO" : "MMIO", + want_pio ? (unsigned long long)io->pio_port : io->mmio_gpa, + (want_pio ? io->pio_port : io->mmio_gpa) + io->ring_size - 1, run->exit_reason, + run->exit_reason == KVM_EXIT_MMIO ? "MMIO" : run->exit_reason == KVM_EXIT_IO ? "PIO" : "other", + run->mmio.phys_addr, run->mmio.is_write, run->mmio.len, *(uint64_t *)run->mmio.data, + run->io.port, run->io.direction, run->io.size, run->io.count, pio_value); +} + +static void vcpu_run_and_verify_coalesced_io(struct kvm_vcpu *vcpu, + struct kvm_coalesced_io *io, + uint32_t ring_start, + uint32_t expected_exit) +{ + struct kvm_coalesced_mmio_ring *ring = io->ring; + int i; + + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, expected_exit); + + TEST_ASSERT((ring->last + 1) % io->ring_size == ring->first, + "Expected ring to be full (minus 1), first = %u, last = %u, max = %u, start = %u", + ring->first, ring->last, io->ring_size, ring_start); + + for (i = 0; i < io->ring_size - 1; i++) { + uint32_t idx = (ring->first + i) % io->ring_size; + struct kvm_coalesced_mmio *entry = &ring->coalesced_mmio[idx]; + +#ifdef __x86_64__ + if (i & 1) + TEST_ASSERT(entry->phys_addr == io->pio_port && + entry->len == 4 && entry->pio && + *(uint32_t *)entry->data == io->pio_port + i, + "Wanted 4-byte port I/O 0x%x = 0x%x in entry %u, got %u-byte %s 0x%llx = 0x%x", + io->pio_port, io->pio_port + i, i, + entry->len, entry->pio ? "PIO" : "MMIO", + entry->phys_addr, *(uint32_t *)entry->data); + else +#endif + TEST_ASSERT(entry->phys_addr == io->mmio_gpa && + entry->len == 8 && !entry->pio, + "Wanted 8-byte MMIO to 0x%lx = %lx in entry %u, got %u-byte %s 0x%llx = 0x%lx", + io->mmio_gpa, io->mmio_gpa + i, i, + entry->len, entry->pio ? 
"PIO" : "MMIO", + entry->phys_addr, *(uint64_t *)entry->data); + } +} + +static void test_coalesced_io(struct kvm_vcpu *vcpu, + struct kvm_coalesced_io *io, uint32_t ring_start) +{ + struct kvm_coalesced_mmio_ring *ring = io->ring; + + kvm_vm_register_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */); +#ifdef __x86_64__ + kvm_vm_register_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */); +#endif + + vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_MMIO); +#ifdef __x86_64__ + vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_IO); +#endif + + /* + * Verify ucall, which may use non-coalesced MMIO or PIO, generates an + * immediate exit. + */ + WRITE_ONCE(ring->first, ring_start); + WRITE_ONCE(ring->last, ring_start); + vcpu_run(vcpu); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); + TEST_ASSERT_EQ(ring->first, ring_start); + TEST_ASSERT_EQ(ring->last, ring_start); + + /* Verify that non-coalesced MMIO/PIO generates an exit to userspace. */ + kvm_vm_unregister_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */); + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_MMIO); + +#ifdef __x86_64__ + kvm_vm_unregister_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */); + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_IO); +#endif +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int i; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_MMIO)); + +#ifdef __x86_64__ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_PIO)); +#endif + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + kvm_builtin_io_ring = (struct kvm_coalesced_io) { + /* + * The I/O ring is a kernel-allocated page whose address is + * relative to each vCPU's run page, with the page offset + * provided by KVM in the return of KVM_CAP_COALESCED_MMIO. + */ + .ring = (void *)vcpu->run + + (kvm_check_cap(KVM_CAP_COALESCED_MMIO) * getpagesize()), + + /* + * The size of the I/O ring is fixed, but KVM defines the sized + * based on the kernel's PAGE_SIZE. Thus, userspace must query + * the host's page size at runtime to compute the ring size. + */ + .ring_size = (getpagesize() - sizeof(struct kvm_coalesced_mmio_ring)) / + sizeof(struct kvm_coalesced_mmio), + + /* + * Arbitrary address+port (MMIO mustn't overlap memslots), with + * the MMIO GPA identity mapped in the guest. 
+ */ + .mmio_gpa = 4ull * SZ_1G, + .mmio = (uint64_t *)(4ull * SZ_1G), + .pio_port = 0x80, + }; + + virt_map(vm, (uint64_t)kvm_builtin_io_ring.mmio, kvm_builtin_io_ring.mmio_gpa, 1); + + sync_global_to_guest(vm, kvm_builtin_io_ring); + vcpu_args_set(vcpu, 1, &kvm_builtin_io_ring); + + for (i = 0; i < kvm_builtin_io_ring.ring_size; i++) + test_coalesced_io(vcpu, &kvm_builtin_io_ring, i); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 63c2aaae51f30..b297a84f7be50 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -460,6 +460,32 @@ static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); } +static inline void kvm_vm_register_coalesced_io(struct kvm_vm *vm, + uint64_t address, + uint64_t size, bool pio) +{ + struct kvm_coalesced_mmio_zone zone = { + .addr = address, + .size = size, + .pio = pio, + }; + + vm_ioctl(vm, KVM_REGISTER_COALESCED_MMIO, &zone); +} + +static inline void kvm_vm_unregister_coalesced_io(struct kvm_vm *vm, + uint64_t address, + uint64_t size, bool pio) +{ + struct kvm_coalesced_mmio_zone zone = { + .addr = address, + .size = size, + .pio = pio, + }; + + vm_ioctl(vm, KVM_UNREGISTER_COALESCED_MMIO, &zone); +} + static inline int vm_get_stats_fd(struct kvm_vm *vm) { int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); From e027ba1b83ad017a56c108eea2f42eb9f8ae5204 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 28 Aug 2024 11:14:46 -0700 Subject: [PATCH 077/352] KVM: Clean up coalesced MMIO ring full check Fold coalesced_mmio_has_room() into its sole caller, coalesced_mmio_write(), as it's really just a single line of code, has a goofy return value, and is unnecessarily brittle. E.g. if coalesced_mmio_has_room() were to check ring->last directly, or the caller failed to use READ_ONCE(), KVM would be susceptible to TOCTOU attacks from userspace. Opportunistically add a comment explaining why on earth KVM leaves one entry free, which may not be obvious to readers that aren't familiar with ring buffers. No functional change intended. Reviewed-by: Ilias Stamatis Cc: Paul Durrant Link: https://lore.kernel.org/r/20240828181446.652474-3-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/coalesced_mmio.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 184c5c40c9c19..375d6285475e9 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -40,25 +40,6 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, return 1; } -static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev, u32 last) -{ - struct kvm_coalesced_mmio_ring *ring; - - /* Are we able to batch it ? */ - - /* last is the first free entry - * check if we don't meet the first used entry - * there is always one unused entry in the buffer - */ - ring = dev->kvm->coalesced_mmio_ring; - if ((last + 1) % KVM_COALESCED_MMIO_MAX == READ_ONCE(ring->first)) { - /* full */ - return 0; - } - - return 1; -} - static int coalesced_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *val) @@ -72,9 +53,15 @@ static int coalesced_mmio_write(struct kvm_vcpu *vcpu, spin_lock(&dev->kvm->ring_lock); + /* + * last is the index of the entry to fill. 
Verify userspace hasn't + * set last to be out of range, and that there is room in the ring. + * Leave one entry free in the ring so that userspace can differentiate + * between an empty ring and a full ring. + */ insert = READ_ONCE(ring->last); - if (!coalesced_mmio_has_room(dev, insert) || - insert >= KVM_COALESCED_MMIO_MAX) { + if (insert >= KVM_COALESCED_MMIO_MAX || + (insert + 1) % KVM_COALESCED_MMIO_MAX == READ_ONCE(ring->first)) { spin_unlock(&dev->kvm->ring_lock); return -EOPNOTSUPP; } From 9d15171f39f0996fcfaec1788d3522eb581af347 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 28 Aug 2024 14:58:00 -0700 Subject: [PATCH 078/352] KVM: selftests: Explicitly include committed one-off assets in .gitignore Add KVM selftests' one-off assets, e.g. the Makefile, to the .gitignore so that they are explicitly included. The justification for omitting the one-offs was that including them wouldn't help prevent mistakes: Deliberately do not include the one-off assets, e.g. config, settings, .gitignore itself, etc as Git doesn't ignore files that are already in the repository. Adding the one-off assets won't prevent mistakes where developers forget to --force add files that don't match the "allowed". Turns out that's not the case, as W=1 will generate warnings, and the amazing-as-always kernel test bot reports new warnings: tools/testing/selftests/kvm/.gitignore: warning: ignored by one of the .gitignore files tools/testing/selftests/kvm/Makefile: warning: ignored by one of the .gitignore files >> tools/testing/selftests/kvm/Makefile.kvm: warning: ignored by one of the .gitignore files tools/testing/selftests/kvm/config: warning: ignored by one of the .gitignore files tools/testing/selftests/kvm/settings: warning: ignored by one of the .gitignore files Fixes: 43e96957e8b8 ("KVM: selftests: Use pattern matching in .gitignore") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408211818.85zIkDEK-lkp@intel.com Link: https://lore.kernel.org/r/20240828215800.737042-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 6d9381d60172f..7f57abf936e73 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -5,3 +5,7 @@ !*.h !*.S !*.sh +!.gitignore +!config +!settings +!Makefile From 0dd45f2cd8ccf150c6fe7a528e9a5282026ed30c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 12:51:16 -0700 Subject: [PATCH 079/352] KVM: x86: Re-enter guest if WRMSR(X2APIC_ICR) fastpath is successful Re-enter the guest in the fastpath if WRMSR emulation for x2APIC's ICR is successful, as no additional work is needed, i.e. there is no code unique for WRMSR exits between the fastpath and the "!= EXIT_FASTPATH_NONE" check in __vmx_handle_exit(). 
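For context, a hedged mental model of the two return values (simplified from vcpu_enter_guest()'s inner loop, not taken from the diff below):

	/* EXIT_FASTPATH_REENTER_GUEST: stay in the inner run loop and VM-Enter
	 *     again immediately.
	 * EXIT_FASTPATH_EXIT_HANDLED:  unwind to the outer run loop and the
	 *     handle_exit path first; extra work with no benefit for a
	 *     successfully emulated ICR write. */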
Link: https://lore.kernel.org/r/20240802195120.325560-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00e7927250525..0c97dd5a01971 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2195,7 +2195,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) data = kvm_read_edx_eax(vcpu); if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { kvm_skip_emulated_instruction(vcpu); - ret = EXIT_FASTPATH_EXIT_HANDLED; + ret = EXIT_FASTPATH_REENTER_GUEST; } break; case MSR_IA32_TSC_DEADLINE: From ea60229af7fbdcdd06d798f8340a7a9b40770c53 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 12:51:17 -0700 Subject: [PATCH 080/352] KVM: x86: Dedup fastpath MSR post-handling logic Now that the WRMSR fastpath for x2APIC_ICR and TSC_DEADLINE are identical, ignoring the backend MSR handling, consolidate the common bits of skipping the instruction and setting the return value. No functional change intended. Link: https://lore.kernel.org/r/20240802195120.325560-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c97dd5a01971..d392e9097d633 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2186,31 +2186,32 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); u64 data; - fastpath_t ret = EXIT_FASTPATH_NONE; + fastpath_t ret; + bool handled; kvm_vcpu_srcu_read_lock(vcpu); switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): data = kvm_read_edx_eax(vcpu); - if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { - kvm_skip_emulated_instruction(vcpu); - ret = EXIT_FASTPATH_REENTER_GUEST; - } + handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); break; case MSR_IA32_TSC_DEADLINE: data = kvm_read_edx_eax(vcpu); - if (!handle_fastpath_set_tscdeadline(vcpu, data)) { - kvm_skip_emulated_instruction(vcpu); - ret = EXIT_FASTPATH_REENTER_GUEST; - } + handled = !handle_fastpath_set_tscdeadline(vcpu, data); break; default: + handled = false; break; } - if (ret != EXIT_FASTPATH_NONE) + if (handled) { + kvm_skip_emulated_instruction(vcpu); + ret = EXIT_FASTPATH_REENTER_GUEST; trace_kvm_msr_write(msr, data); + } else { + ret = EXIT_FASTPATH_NONE; + } kvm_vcpu_srcu_read_unlock(vcpu); From f7f39c50edb9d336274371953275e0d3503b9b75 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 12:51:18 -0700 Subject: [PATCH 081/352] KVM: x86: Exit to userspace if fastpath triggers one on instruction skip Exit to userspace if a fastpath handler triggers such an exit, which can happen when skipping the instruction, e.g. due to userspace single-stepping the guest via KVM_GUESTDBG_SINGLESTEP or because of an emulation failure. 
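Hedged summary of the failure mode being handled: kvm_skip_emulated_instruction() returns 0 when completing the skip itself requires a userspace exit (e.g. KVM_GUESTDBG_SINGLESTEP queues a KVM_EXIT_DEBUG), and the new return value lets the fastpath propagate that instead of silently re-entering the guest:

	if (!kvm_skip_emulated_instruction(vcpu))
		ret = EXIT_FASTPATH_EXIT_USERSPACE;	/* let userspace see the exit */
	else
		ret = EXIT_FASTPATH_REENTER_GUEST;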
Fixes: 404d5d7bff0d ("KVM: X86: Introduce more exit_fastpath_completion enum values") Link: https://lore.kernel.org/r/20240802195120.325560-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f9dfb2d62053a..430a0d3693225 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -211,6 +211,7 @@ enum exit_fastpath_completion { EXIT_FASTPATH_NONE, EXIT_FASTPATH_REENTER_GUEST, EXIT_FASTPATH_EXIT_HANDLED, + EXIT_FASTPATH_EXIT_USERSPACE, }; typedef enum exit_fastpath_completion fastpath_t; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d392e9097d633..d6c81d59d4872 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2206,8 +2206,10 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) } if (handled) { - kvm_skip_emulated_instruction(vcpu); - ret = EXIT_FASTPATH_REENTER_GUEST; + if (!kvm_skip_emulated_instruction(vcpu)) + ret = EXIT_FASTPATH_EXIT_USERSPACE; + else + ret = EXIT_FASTPATH_REENTER_GUEST; trace_kvm_msr_write(msr, data); } else { ret = EXIT_FASTPATH_NONE; @@ -11196,6 +11198,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); + if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) + return 0; + r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); return r; From 70cdd2385106a91675ee0ba58facde0254597416 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 12:51:19 -0700 Subject: [PATCH 082/352] KVM: x86: Reorganize code in x86.c to co-locate vCPU blocking/running helpers Shuffle code around in x86.c so that the various helpers related to vCPU blocking/running logic are (a) located near each other and (b) ordered so that HLT emulation can use kvm_vcpu_has_events() in a future path. No functional change intended. Link: https://lore.kernel.org/r/20240802195120.325560-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 264 ++++++++++++++++++++++----------------------- 1 file changed, 132 insertions(+), 132 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d6c81d59d4872..c15eb8e7d3c39 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9917,51 +9917,6 @@ void kvm_x86_vendor_exit(void) } EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); -static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) -{ - /* - * The vCPU has halted, e.g. executed HLT. Update the run state if the - * local APIC is in-kernel, the run loop will detect the non-runnable - * state and halt the vCPU. Exit to userspace if the local APIC is - * managed by userspace, in which case userspace is responsible for - * handling wake events. - */ - ++vcpu->stat.halt_exits; - if (lapic_in_kernel(vcpu)) { - vcpu->arch.mp_state = state; - return 1; - } else { - vcpu->run->exit_reason = reason; - return 0; - } -} - -int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) -{ - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); - -int kvm_emulate_halt(struct kvm_vcpu *vcpu) -{ - int ret = kvm_skip_emulated_instruction(vcpu); - /* - * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered - * KVM_EXIT_DEBUG here. 
- */ - return kvm_emulate_halt_noskip(vcpu) && ret; -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt); - -int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) -{ - int ret = kvm_skip_emulated_instruction(vcpu); - - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, - KVM_EXIT_AP_RESET_HOLD) && ret; -} -EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); - #ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, unsigned long clock_type) @@ -11214,6 +11169,67 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) return r; } +static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted); +} + +static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +{ + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; + + if (kvm_apic_has_pending_init_or_sipi(vcpu) && + kvm_apic_init_sipi_allowed(vcpu)) + return true; + + if (vcpu->arch.pv.pv_unhalted) + return true; + + if (kvm_is_exception_pending(vcpu)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + (vcpu->arch.nmi_pending && + kvm_x86_call(nmi_allowed)(vcpu, false))) + return true; + +#ifdef CONFIG_KVM_SMM + if (kvm_test_request(KVM_REQ_SMI, vcpu) || + (vcpu->arch.smi_pending && + kvm_x86_call(smi_allowed)(vcpu, false))) + return true; +#endif + + if (kvm_test_request(KVM_REQ_PMI, vcpu)) + return true; + + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) + return true; + + if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) + return true; + + if (kvm_hv_has_stimer_pending(vcpu)) + return true; + + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->has_events && + kvm_x86_ops.nested_ops->has_events(vcpu, false)) + return true; + + if (kvm_xen_has_pending_events(vcpu)) + return true; + + return false; +} + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); +} + /* Called within kvm->srcu read side. */ static inline int vcpu_block(struct kvm_vcpu *vcpu) { @@ -11285,12 +11301,6 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) return 1; } -static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) -{ - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted); -} - /* Called within kvm->srcu read side. */ static int vcpu_run(struct kvm_vcpu *vcpu) { @@ -11342,6 +11352,77 @@ static int vcpu_run(struct kvm_vcpu *vcpu) return r; } +static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) +{ + /* + * The vCPU has halted, e.g. executed HLT. Update the run state if the + * local APIC is in-kernel, the run loop will detect the non-runnable + * state and halt the vCPU. Exit to userspace if the local APIC is + * managed by userspace, in which case userspace is responsible for + * handling wake events. + */ + ++vcpu->stat.halt_exits; + if (lapic_in_kernel(vcpu)) { + vcpu->arch.mp_state = state; + return 1; + } else { + vcpu->run->exit_reason = reason; + return 0; + } +} + +int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); + +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + /* + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. 
+ */ + return kvm_emulate_halt_noskip(vcpu) && ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt); + +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, + KVM_EXIT_AP_RESET_HOLD) && ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); + +bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_apicv_active(vcpu) && + kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); +} + +bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.preempted_in_kernel; +} + +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || +#ifdef CONFIG_KVM_SMM + kvm_test_request(KVM_REQ_SMI, vcpu) || +#endif + kvm_test_request(KVM_REQ_EVENT, vcpu)) + return true; + + return kvm_arch_dy_has_pending_interrupt(vcpu); +} + static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); @@ -13156,87 +13237,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_arch_free_memslot(kvm, old); } -static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) -{ - if (!list_empty_careful(&vcpu->async_pf.done)) - return true; - - if (kvm_apic_has_pending_init_or_sipi(vcpu) && - kvm_apic_init_sipi_allowed(vcpu)) - return true; - - if (vcpu->arch.pv.pv_unhalted) - return true; - - if (kvm_is_exception_pending(vcpu)) - return true; - - if (kvm_test_request(KVM_REQ_NMI, vcpu) || - (vcpu->arch.nmi_pending && - kvm_x86_call(nmi_allowed)(vcpu, false))) - return true; - -#ifdef CONFIG_KVM_SMM - if (kvm_test_request(KVM_REQ_SMI, vcpu) || - (vcpu->arch.smi_pending && - kvm_x86_call(smi_allowed)(vcpu, false))) - return true; -#endif - - if (kvm_test_request(KVM_REQ_PMI, vcpu)) - return true; - - if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) - return true; - - if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) - return true; - - if (kvm_hv_has_stimer_pending(vcpu)) - return true; - - if (is_guest_mode(vcpu) && - kvm_x86_ops.nested_ops->has_events && - kvm_x86_ops.nested_ops->has_events(vcpu, false)) - return true; - - if (kvm_xen_has_pending_events(vcpu)) - return true; - - return false; -} - -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); -} - -bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_apicv_active(vcpu) && - kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); -} - -bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.preempted_in_kernel; -} - -bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) -{ - if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) - return true; - - if (kvm_test_request(KVM_REQ_NMI, vcpu) || -#ifdef CONFIG_KVM_SMM - kvm_test_request(KVM_REQ_SMI, vcpu) || -#endif - kvm_test_request(KVM_REQ_EVENT, vcpu)) - return true; - - return kvm_arch_dy_has_pending_interrupt(vcpu); -} - bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { if (vcpu->arch.guest_state_protected) From 1876dd69dfe8c29e249070b0e4dc941fc15ac1e4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 12:51:20 -0700 Subject: [PATCH 083/352] KVM: x86: Add fastpath handling of HLT VM-Exits Add a fastpath for HLT VM-Exits by immediately re-entering the guest if it has a pending wake event. When virtual interrupt delivery is enabled, i.e. 
when KVM doesn't need to manually inject interrupts, this allows KVM to stay in the fastpath run loop when a vIRQ arrives between the guest doing CLI and STI;HLT. Without AMD's Idle HLT-intercept support, the CPU generates a HLT VM-Exit even though KVM will immediately resume the guest. Note, on bare metal, it's relatively uncommon for a modern guest kernel to actually trigger this scenario, as the window between the guest checking for a wake event and committing to HLT is quite small. But in a nested environment, the timings change significantly, e.g. rudimentary testing showed that ~50% of HLT exits where HLT-polling was successful would be serviced by this fastpath, i.e. ~50% of the time that a nested vCPU gets a wake event before KVM schedules out the vCPU, the wake event was pending even before the VM-Exit. Link: https://lore.kernel.org/all/20240528041926.3989-3-manali.shukla@amd.com Link: https://lore.kernel.org/r/20240802195120.325560-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 13 +++++++++++-- arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/x86.c | 23 ++++++++++++++++++++++- arch/x86/kvm/x86.h | 1 + 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index eb3de01602b96..eb64976590fba 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4147,12 +4147,21 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) + switch (svm->vmcb->control.exit_code) { + case SVM_EXIT_MSR: + if (!svm->vmcb->control.exit_info_1) + break; return handle_fastpath_set_msr_irqoff(vcpu); + case SVM_EXIT_HLT: + return handle_fastpath_hlt(vcpu); + default: + break; + } return EXIT_FASTPATH_NONE; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cf85f8d50ccb3..9cac0ffb85535 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7265,6 +7265,8 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); + case EXIT_REASON_HLT: + return handle_fastpath_hlt(vcpu); default: return EXIT_FASTPATH_NONE; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c15eb8e7d3c39..fa455a60b5574 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11363,7 +11363,10 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - vcpu->arch.mp_state = state; + if (kvm_vcpu_has_events(vcpu)) + vcpu->arch.pv.pv_unhalted = false; + else + vcpu->arch.mp_state = state; return 1; } else { vcpu->run->exit_reason = reason; @@ -11388,6 +11391,24 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) +{ + int ret; + + kvm_vcpu_srcu_read_lock(vcpu); + ret = kvm_emulate_halt(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); + + if (!ret) + return EXIT_FASTPATH_EXIT_USERSPACE; + + if (kvm_vcpu_running(vcpu)) + return EXIT_FASTPATH_REENTER_GUEST; + + return EXIT_FASTPATH_EXIT_HANDLED; +} +EXPORT_SYMBOL_GPL(handle_fastpath_hlt); + int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) { int ret = 
kvm_skip_emulated_instruction(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f47b9905ba782..516eb9e28752e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -334,6 +334,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; From 44d17459626052a2390457e550a12cb973506b2f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:51 -0700 Subject: [PATCH 084/352] KVM: Use dedicated mutex to protect kvm_usage_count to avoid deadlock Use a dedicated mutex to guard kvm_usage_count to fix a potential deadlock on x86 due to a chain of locks and SRCU synchronizations. Translating the below lockdep splat, CPU1 #6 will wait on CPU0 #1, CPU0 #8 will wait on CPU2 #3, and CPU2 #7 will wait on CPU1 #4 (if there's a writer, due to the fairness of r/w semaphores). CPU0 CPU1 CPU2 1 lock(&kvm->slots_lock); 2 lock(&vcpu->mutex); 3 lock(&kvm->srcu); 4 lock(cpu_hotplug_lock); 5 lock(kvm_lock); 6 lock(&kvm->slots_lock); 7 lock(cpu_hotplug_lock); 8 sync(&kvm->srcu); Note, there are likely more potential deadlocks in KVM x86, e.g. the same pattern of taking cpu_hotplug_lock outside of kvm_lock likely exists with __kvmclock_cpufreq_notifier(): cpuhp_cpufreq_online() | -> cpufreq_online() | -> cpufreq_gov_performance_limits() | -> __cpufreq_driver_target() | -> __target_index() | -> cpufreq_freq_transition_begin() | -> cpufreq_notify_transition() | -> ... __kvmclock_cpufreq_notifier() But, actually triggering such deadlocks is beyond rare due to the combination of dependencies and timings involved. E.g. the cpufreq notifier is only used on older CPUs without a constant TSC, mucking with the NX hugepage mitigation while VMs are running is very uncommon, and doing so while also onlining/offlining a CPU (necessary to generate contention on cpu_hotplug_lock) would be even more unusual. The most robust solution to the general cpu_hotplug_lock issue is likely to switch vm_list to be an RCU-protected list, e.g. so that x86's cpufreq notifier doesn't to take kvm_lock. For now, settle for fixing the most blatant deadlock, as switching to an RCU-protected list is a much more involved change, but add a comment in locking.rst to call out that care needs to be taken when walking holding kvm_lock and walking vm_list. ====================================================== WARNING: possible circular locking dependency detected 6.10.0-smp--c257535a0c9d-pip #330 Tainted: G S O ------------------------------------------------------ tee/35048 is trying to acquire lock: ff6a80eced71e0a8 (&kvm->slots_lock){+.+.}-{3:3}, at: set_nx_huge_pages+0x179/0x1e0 [kvm] but task is already holding lock: ffffffffc07abb08 (kvm_lock){+.+.}-{3:3}, at: set_nx_huge_pages+0x14a/0x1e0 [kvm] which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #3 (kvm_lock){+.+.}-{3:3}: __mutex_lock+0x6a/0xb40 mutex_lock_nested+0x1f/0x30 kvm_dev_ioctl+0x4fb/0xe50 [kvm] __se_sys_ioctl+0x7b/0xd0 __x64_sys_ioctl+0x21/0x30 x64_sys_call+0x15d0/0x2e60 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #2 (cpu_hotplug_lock){++++}-{0:0}: cpus_read_lock+0x2e/0xb0 static_key_slow_inc+0x16/0x30 kvm_lapic_set_base+0x6a/0x1c0 [kvm] kvm_set_apic_base+0x8f/0xe0 [kvm] kvm_set_msr_common+0x9ae/0xf80 [kvm] vmx_set_msr+0xa54/0xbe0 [kvm_intel] __kvm_set_msr+0xb6/0x1a0 [kvm] kvm_arch_vcpu_ioctl+0xeca/0x10c0 [kvm] kvm_vcpu_ioctl+0x485/0x5b0 [kvm] __se_sys_ioctl+0x7b/0xd0 __x64_sys_ioctl+0x21/0x30 x64_sys_call+0x15d0/0x2e60 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #1 (&kvm->srcu){.+.+}-{0:0}: __synchronize_srcu+0x44/0x1a0 synchronize_srcu_expedited+0x21/0x30 kvm_swap_active_memslots+0x110/0x1c0 [kvm] kvm_set_memslot+0x360/0x620 [kvm] __kvm_set_memory_region+0x27b/0x300 [kvm] kvm_vm_ioctl_set_memory_region+0x43/0x60 [kvm] kvm_vm_ioctl+0x295/0x650 [kvm] __se_sys_ioctl+0x7b/0xd0 __x64_sys_ioctl+0x21/0x30 x64_sys_call+0x15d0/0x2e60 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #0 (&kvm->slots_lock){+.+.}-{3:3}: __lock_acquire+0x15ef/0x2e30 lock_acquire+0xe0/0x260 __mutex_lock+0x6a/0xb40 mutex_lock_nested+0x1f/0x30 set_nx_huge_pages+0x179/0x1e0 [kvm] param_attr_store+0x93/0x100 module_attr_store+0x22/0x40 sysfs_kf_write+0x81/0xb0 kernfs_fop_write_iter+0x133/0x1d0 vfs_write+0x28d/0x380 ksys_write+0x70/0xe0 __x64_sys_write+0x1f/0x30 x64_sys_call+0x281b/0x2e60 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e Cc: Chao Gao Fixes: 0bf50497f03b ("KVM: Drop kvm_count_lock and instead protect kvm_usage_count with kvm_lock") Cc: stable@vger.kernel.org Reviewed-by: Kai Huang Acked-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/locking.rst | 32 +++++++++++++++++++++--------- virt/kvm/kvm_main.c | 31 +++++++++++++++-------------- 2 files changed, 39 insertions(+), 24 deletions(-) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 02880d5552d5f..c0cb5ce51c1e0 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -9,7 +9,7 @@ KVM Lock Overview The acquisition orders for mutexes are as follows: -- cpus_read_lock() is taken outside kvm_lock +- cpus_read_lock() is taken outside kvm_lock and kvm_usage_lock - kvm->lock is taken outside vcpu->mutex @@ -24,6 +24,12 @@ The acquisition orders for mutexes are as follows: are taken on the waiting side when modifying memslots, so MMU notifiers must not take either kvm->slots_lock or kvm->slots_arch_lock. +cpus_read_lock() vs kvm_lock: +- Taking cpus_read_lock() outside of kvm_lock is problematic, despite that + being the official ordering, as it is quite easy to unknowingly trigger + cpus_read_lock() while holding kvm_lock. Use caution when walking vm_list, + e.g. avoid complex operations when possible. + For SRCU: - ``synchronize_srcu(&kvm->srcu)`` is called inside critical sections @@ -227,10 +233,17 @@ time it will be set using the Dirty tracking mechanism described above. 
:Type: mutex :Arch: any :Protects: - vm_list - - kvm_usage_count + +``kvm_usage_lock`` +^^^^^^^^^^^^^^^^^^ + +:Type: mutex +:Arch: any +:Protects: - kvm_usage_count - hardware virtualization enable/disable -:Comment: KVM also disables CPU hotplug via cpus_read_lock() during - enable/disable. +:Comment: Exists because using kvm_lock leads to deadlock (see earlier comment + on cpus_read_lock() vs kvm_lock). Note, KVM also disables CPU hotplug via + cpus_read_lock() when enabling/disabling virtualization. ``kvm->mn_invalidate_lock`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -290,11 +303,12 @@ time it will be set using the Dirty tracking mechanism described above. wakeup. ``vendor_module_lock`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^ :Type: mutex :Arch: x86 :Protects: loading a vendor module (kvm_amd or kvm_intel) -:Comment: Exists because using kvm_lock leads to deadlock. cpu_hotplug_lock is - taken outside of kvm_lock, e.g. in KVM's CPU online/offline callbacks, and - many operations need to take cpu_hotplug_lock when loading a vendor module, - e.g. updating static calls. +:Comment: Exists because using kvm_lock leads to deadlock. kvm_lock is taken + in notifiers, e.g. __kvmclock_cpufreq_notifier(), that may be invoked while + cpu_hotplug_lock is held, e.g. from cpufreq_boost_trigger_state(), and many + operations need to take cpu_hotplug_lock when loading a vendor module, e.g. + updating static calls. diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cb2b78e92910f..7164a9ece2087 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5575,6 +5575,7 @@ __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); static DEFINE_PER_CPU(bool, hardware_enabled); +static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; static int __hardware_enable_nolock(void) @@ -5607,10 +5608,10 @@ static int kvm_online_cpu(unsigned int cpu) * be enabled. Otherwise running VMs would encounter unrecoverable * errors when scheduled to this CPU. */ - mutex_lock(&kvm_lock); + mutex_lock(&kvm_usage_lock); if (kvm_usage_count) ret = __hardware_enable_nolock(); - mutex_unlock(&kvm_lock); + mutex_unlock(&kvm_usage_lock); return ret; } @@ -5630,10 +5631,10 @@ static void hardware_disable_nolock(void *junk) static int kvm_offline_cpu(unsigned int cpu) { - mutex_lock(&kvm_lock); + mutex_lock(&kvm_usage_lock); if (kvm_usage_count) hardware_disable_nolock(NULL); - mutex_unlock(&kvm_lock); + mutex_unlock(&kvm_usage_lock); return 0; } @@ -5649,9 +5650,9 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { cpus_read_lock(); - mutex_lock(&kvm_lock); + mutex_lock(&kvm_usage_lock); hardware_disable_all_nolock(); - mutex_unlock(&kvm_lock); + mutex_unlock(&kvm_usage_lock); cpus_read_unlock(); } @@ -5682,7 +5683,7 @@ static int hardware_enable_all(void) * enable hardware multiple times. */ cpus_read_lock(); - mutex_lock(&kvm_lock); + mutex_lock(&kvm_usage_lock); r = 0; @@ -5696,7 +5697,7 @@ static int hardware_enable_all(void) } } - mutex_unlock(&kvm_lock); + mutex_unlock(&kvm_usage_lock); cpus_read_unlock(); return r; @@ -5724,13 +5725,13 @@ static int kvm_suspend(void) { /* * Secondary CPUs and CPU hotplug are disabled across the suspend/resume - * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count - * is stable. Assert that kvm_lock is not held to ensure the system - * isn't suspended while KVM is enabling hardware. 
Hardware enabling - * can be preempted, but the task cannot be frozen until it has dropped - * all locks (userspace tasks are frozen via a fake signal). + * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage + * count is stable. Assert that kvm_usage_lock is not held to ensure + * the system isn't suspended while KVM is enabling hardware. Hardware + * enabling can be preempted, but the task cannot be frozen until it has + * dropped all locks (userspace tasks are frozen via a fake signal). */ - lockdep_assert_not_held(&kvm_lock); + lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); if (kvm_usage_count) @@ -5740,7 +5741,7 @@ static int kvm_suspend(void) static void kvm_resume(void) { - lockdep_assert_not_held(&kvm_lock); + lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); if (kvm_usage_count) From 9a798b1337afe4fdcce53efa77953b068d8614f5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:52 -0700 Subject: [PATCH 085/352] KVM: Register cpuhp and syscore callbacks when enabling hardware Register KVM's cpuhp and syscore callback when enabling virtualization in hardware instead of registering the callbacks during initialization, and let the CPU up/down framework invoke the inner enable/disable functions. Registering the callbacks during initialization makes things more complex than they need to be, as KVM needs to be very careful about handling races between enabling CPUs being onlined/offlined and hardware being enabled/disabled. Intel TDX support will require KVM to enable virtualization during KVM initialization, i.e. will add another wrinkle to things, at which point sorting out the potential races with kvm_usage_count would become even more complex. Note, using the cpuhp framework has a subtle behavioral change: enabling will be done serially across all CPUs, whereas KVM currently sends an IPI to all CPUs in parallel. While serializing virtualization enabling could create undesirable latency, the issue is limited to the 0=>1 transition of VM creation. And even that can be mitigated, e.g. by letting userspace force virtualization to be enabled when KVM is initialized. Cc: Chao Gao Reviewed-by: Kai Huang Acked-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/locking.rst | 9 +- virt/kvm/kvm_main.c | 174 ++++++++++------------------- 2 files changed, 66 insertions(+), 117 deletions(-) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index c0cb5ce51c1e0..be3c323888b19 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -9,7 +9,9 @@ KVM Lock Overview The acquisition orders for mutexes are as follows: -- cpus_read_lock() is taken outside kvm_lock and kvm_usage_lock +- cpus_read_lock() is taken outside kvm_lock + +- kvm_usage_lock is taken outside cpus_read_lock() - kvm->lock is taken outside vcpu->mutex @@ -241,9 +243,8 @@ time it will be set using the Dirty tracking mechanism described above. :Arch: any :Protects: - kvm_usage_count - hardware virtualization enable/disable -:Comment: Exists because using kvm_lock leads to deadlock (see earlier comment - on cpus_read_lock() vs kvm_lock). Note, KVM also disables CPU hotplug via - cpus_read_lock() when enabling/disabling virtualization. 
+:Comment: Exists to allow taking cpus_read_lock() while kvm_usage_count is + protected, which simplifies the virtualization enabling logic. ``kvm->mn_invalidate_lock`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7164a9ece2087..9ad8903b575aa 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5578,7 +5578,7 @@ static DEFINE_PER_CPU(bool, hardware_enabled); static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; -static int __hardware_enable_nolock(void) +static int hardware_enable_nolock(void) { if (__this_cpu_read(hardware_enabled)) return 0; @@ -5593,34 +5593,18 @@ static int __hardware_enable_nolock(void) return 0; } -static void hardware_enable_nolock(void *failed) -{ - if (__hardware_enable_nolock()) - atomic_inc(failed); -} - static int kvm_online_cpu(unsigned int cpu) { - int ret = 0; - /* * Abort the CPU online process if hardware virtualization cannot * be enabled. Otherwise running VMs would encounter unrecoverable * errors when scheduled to this CPU. */ - mutex_lock(&kvm_usage_lock); - if (kvm_usage_count) - ret = __hardware_enable_nolock(); - mutex_unlock(&kvm_usage_lock); - return ret; + return hardware_enable_nolock(); } static void hardware_disable_nolock(void *junk) { - /* - * Note, hardware_disable_all_nolock() tells all online CPUs to disable - * hardware, not just CPUs that successfully enabled hardware! - */ if (!__this_cpu_read(hardware_enabled)) return; @@ -5631,78 +5615,10 @@ static void hardware_disable_nolock(void *junk) static int kvm_offline_cpu(unsigned int cpu) { - mutex_lock(&kvm_usage_lock); - if (kvm_usage_count) - hardware_disable_nolock(NULL); - mutex_unlock(&kvm_usage_lock); + hardware_disable_nolock(NULL); return 0; } -static void hardware_disable_all_nolock(void) -{ - BUG_ON(!kvm_usage_count); - - kvm_usage_count--; - if (!kvm_usage_count) - on_each_cpu(hardware_disable_nolock, NULL, 1); -} - -static void hardware_disable_all(void) -{ - cpus_read_lock(); - mutex_lock(&kvm_usage_lock); - hardware_disable_all_nolock(); - mutex_unlock(&kvm_usage_lock); - cpus_read_unlock(); -} - -static int hardware_enable_all(void) -{ - atomic_t failed = ATOMIC_INIT(0); - int r; - - /* - * Do not enable hardware virtualization if the system is going down. - * If userspace initiated a forced reboot, e.g. reboot -f, then it's - * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling - * after kvm_reboot() is called. Note, this relies on system_state - * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops - * hook instead of registering a dedicated reboot notifier (the latter - * runs before system_state is updated). - */ - if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || - system_state == SYSTEM_RESTART) - return -EBUSY; - - /* - * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu() - * is called, and so on_each_cpu() between them includes the CPU that - * is being onlined. As a result, hardware_enable_nolock() may get - * invoked before kvm_online_cpu(), which also enables hardware if the - * usage count is non-zero. Disable CPU hotplug to avoid attempting to - * enable hardware multiple times. 
- */ - cpus_read_lock(); - mutex_lock(&kvm_usage_lock); - - r = 0; - - kvm_usage_count++; - if (kvm_usage_count == 1) { - on_each_cpu(hardware_enable_nolock, &failed, 1); - - if (atomic_read(&failed)) { - hardware_disable_all_nolock(); - r = -EBUSY; - } - } - - mutex_unlock(&kvm_usage_lock); - cpus_read_unlock(); - - return r; -} - static void kvm_shutdown(void) { /* @@ -5734,8 +5650,7 @@ static int kvm_suspend(void) lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); - if (kvm_usage_count) - hardware_disable_nolock(NULL); + hardware_disable_nolock(NULL); return 0; } @@ -5744,8 +5659,7 @@ static void kvm_resume(void) lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); - if (kvm_usage_count) - WARN_ON_ONCE(__hardware_enable_nolock()); + WARN_ON_ONCE(hardware_enable_nolock()); } static struct syscore_ops kvm_syscore_ops = { @@ -5753,6 +5667,60 @@ static struct syscore_ops kvm_syscore_ops = { .resume = kvm_resume, .shutdown = kvm_shutdown, }; + +static int hardware_enable_all(void) +{ + int r; + + guard(mutex)(&kvm_usage_lock); + + if (kvm_usage_count++) + return 0; + + r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", + kvm_online_cpu, kvm_offline_cpu); + if (r) + goto err_cpuhp; + + register_syscore_ops(&kvm_syscore_ops); + + /* + * Undo virtualization enabling and bail if the system is going down. + * If userspace initiated a forced reboot, e.g. reboot -f, then it's + * possible for an in-flight operation to enable virtualization after + * syscore_shutdown() is called, i.e. without kvm_shutdown() being + * invoked. Note, this relies on system_state being set _before_ + * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked + * or this CPU observes the impending shutdown. Which is why KVM uses + * a syscore ops hook instead of registering a dedicated reboot + * notifier (the latter runs before system_state is updated). + */ + if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || + system_state == SYSTEM_RESTART) { + r = -EBUSY; + goto err_rebooting; + } + + return 0; + +err_rebooting: + unregister_syscore_ops(&kvm_syscore_ops); + cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); +err_cpuhp: + --kvm_usage_count; + return r; +} + +static void hardware_disable_all(void) +{ + guard(mutex)(&kvm_usage_lock); + + if (--kvm_usage_count) + return; + + unregister_syscore_ops(&kvm_syscore_ops); + cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); +} #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static int hardware_enable_all(void) { @@ -6461,15 +6429,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) int r; int cpu; -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", - kvm_online_cpu, kvm_offline_cpu); - if (r) - return r; - - register_syscore_ops(&kvm_syscore_ops); -#endif - /* A kmem cache lets us meet the alignment requirements of fx_save. 
*/ if (!vcpu_align) vcpu_align = __alignof__(struct kvm_vcpu); @@ -6480,10 +6439,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) offsetofend(struct kvm_vcpu, stats_id) - offsetof(struct kvm_vcpu, arch), NULL); - if (!kvm_vcpu_cache) { - r = -ENOMEM; - goto err_vcpu_cache; - } + if (!kvm_vcpu_cache) + return -ENOMEM; for_each_possible_cpu(cpu) { if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), @@ -6540,11 +6497,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); -err_vcpu_cache: -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING - unregister_syscore_ops(&kvm_syscore_ops); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); -#endif return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6566,10 +6518,6 @@ void kvm_exit(void) kmem_cache_destroy(kvm_vcpu_cache); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING - unregister_syscore_ops(&kvm_syscore_ops); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); -#endif kvm_irqfd_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); From 70c0194337d38dd29533e63e3cb07620f8c5eae1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:53 -0700 Subject: [PATCH 086/352] KVM: Rename symbols related to enabling virtualization hardware Rename the various functions (and a variable) that enable virtualization to prepare for upcoming changes, and to clean up artifacts of KVM's previous behavior, which required manually juggling locks around kvm_usage_count. Drop the "nolock" qualifier from per-CPU functions now that there are no "nolock" implementations of the "all" variants, i.e. now that calling a non-nolock function from a nolock function isn't confusing (unlike this sentence). Drop "all" from the outer helpers as they no longer manually iterate over all CPUs, and because it might not be obvious what "all" refers to. In lieu of the above qualifiers, append "_cpu" to the end of the functions that are per-CPU helpers for the outer APIs. Opportunistically prepend "kvm" to all functions to help make it clear that they are KVM helpers, but mostly because there's no reason not to. Lastly, use "virtualization" instead of "hardware", because while the functions do enable virtualization in hardware, there are a _lot_ of things that KVM enables in hardware. Defer renaming the arch hooks to future patches, purely to reduce the amount of churn in a single commit. 
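The rewritten enable/disable paths above take kvm_usage_lock via guard(mutex)(&kvm_usage_lock) rather than explicit mutex_lock()/mutex_unlock() pairs. A minimal, self-contained sketch of what the scope-based guard provides is below; the demo_* names are invented for illustration and this is not the kvm_main.c code.

  #include <linux/cleanup.h>
  #include <linux/errno.h>
  #include <linux/limits.h>
  #include <linux/mutex.h>

  static DEFINE_MUTEX(demo_lock);
  static int demo_count;

  static int demo_get(void)
  {
          /* mutex_lock(&demo_lock) here, mutex_unlock() on every return path. */
          guard(mutex)(&demo_lock);

          if (demo_count == INT_MAX)
                  return -EOVERFLOW;      /* early return still drops the lock */

          demo_count++;
          return 0;
  }

Because the unlock is tied to scope exit, adding a new early return to the usage-count paths cannot leak kvm_usage_lock.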
Reviewed-by: Chao Gao Reviewed-by: Kai Huang Acked-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9ad8903b575aa..4efbf9f841496 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -136,8 +136,8 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file) #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ .open = kvm_no_compat_open #endif -static int hardware_enable_all(void); -static void hardware_disable_all(void); +static int kvm_enable_virtualization(void); +static void kvm_disable_virtualization(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); @@ -1220,7 +1220,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) if (r) goto out_err_no_arch_destroy_vm; - r = hardware_enable_all(); + r = kvm_enable_virtualization(); if (r) goto out_err_no_disable; @@ -1263,7 +1263,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); #endif out_err_no_mmu_notifier: - hardware_disable_all(); + kvm_disable_virtualization(); out_err_no_disable: kvm_arch_destroy_vm(kvm); out_err_no_arch_destroy_vm: @@ -1360,7 +1360,7 @@ static void kvm_destroy_vm(struct kvm *kvm) #endif kvm_arch_free_vm(kvm); preempt_notifier_dec(); - hardware_disable_all(); + kvm_disable_virtualization(); mmdrop(mm); } @@ -5574,13 +5574,13 @@ static struct miscdevice kvm_dev = { __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); -static DEFINE_PER_CPU(bool, hardware_enabled); +static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; -static int hardware_enable_nolock(void) +static int kvm_enable_virtualization_cpu(void) { - if (__this_cpu_read(hardware_enabled)) + if (__this_cpu_read(virtualization_enabled)) return 0; if (kvm_arch_hardware_enable()) { @@ -5589,7 +5589,7 @@ static int hardware_enable_nolock(void) return -EIO; } - __this_cpu_write(hardware_enabled, true); + __this_cpu_write(virtualization_enabled, true); return 0; } @@ -5600,22 +5600,22 @@ static int kvm_online_cpu(unsigned int cpu) * be enabled. Otherwise running VMs would encounter unrecoverable * errors when scheduled to this CPU. 
*/ - return hardware_enable_nolock(); + return kvm_enable_virtualization_cpu(); } -static void hardware_disable_nolock(void *junk) +static void kvm_disable_virtualization_cpu(void *ign) { - if (!__this_cpu_read(hardware_enabled)) + if (!__this_cpu_read(virtualization_enabled)) return; kvm_arch_hardware_disable(); - __this_cpu_write(hardware_enabled, false); + __this_cpu_write(virtualization_enabled, false); } static int kvm_offline_cpu(unsigned int cpu) { - hardware_disable_nolock(NULL); + kvm_disable_virtualization_cpu(NULL); return 0; } @@ -5634,7 +5634,7 @@ static void kvm_shutdown(void) */ pr_info("kvm: exiting hardware virtualization\n"); kvm_rebooting = true; - on_each_cpu(hardware_disable_nolock, NULL, 1); + on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); } static int kvm_suspend(void) @@ -5650,7 +5650,7 @@ static int kvm_suspend(void) lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); - hardware_disable_nolock(NULL); + kvm_disable_virtualization_cpu(NULL); return 0; } @@ -5659,7 +5659,7 @@ static void kvm_resume(void) lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); - WARN_ON_ONCE(hardware_enable_nolock()); + WARN_ON_ONCE(kvm_enable_virtualization_cpu()); } static struct syscore_ops kvm_syscore_ops = { @@ -5668,7 +5668,7 @@ static struct syscore_ops kvm_syscore_ops = { .shutdown = kvm_shutdown, }; -static int hardware_enable_all(void) +static int kvm_enable_virtualization(void) { int r; @@ -5711,7 +5711,7 @@ static int hardware_enable_all(void) return r; } -static void hardware_disable_all(void) +static void kvm_disable_virtualization(void) { guard(mutex)(&kvm_usage_lock); @@ -5722,12 +5722,12 @@ static void hardware_disable_all(void) cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); } #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ -static int hardware_enable_all(void) +static int kvm_enable_virtualization(void) { return 0; } -static void hardware_disable_all(void) +static void kvm_disable_virtualization(void) { } From 071f24ad28cdcdfa544fdb79b1b1d2b423717a11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:54 -0700 Subject: [PATCH 087/352] KVM: Rename arch hooks related to per-CPU virtualization enabling Rename the per-CPU hooks used to enable virtualization in hardware to align with the KVM-wide helpers in kvm_main.c, and to better capture that the callbacks are invoked on every online CPU. No functional change intended. 
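Taken together with the two preceding patches, the renamed pieces line up as follows; this is a simplified sketch of the call paths, not literal code:

  /*
   * 0=>1 transition of kvm_usage_count:
   *
   *   kvm_enable_virtualization()
   *     cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", ...)
   *       kvm_online_cpu()                    [invoked on each online CPU]
   *         kvm_enable_virtualization_cpu()
   *           kvm_arch_enable_virtualization_cpu()
   *
   * CPU onlining and resume reuse the same per-CPU path:
   *
   *   kvm_online_cpu() / kvm_resume()
   *     kvm_enable_virtualization_cpu()
   *       kvm_arch_enable_virtualization_cpu()
   *
   * Teardown mirrors this via kvm_offline_cpu(), kvm_suspend(),
   * kvm_shutdown() and kvm_disable_virtualization(), ending in
   * kvm_arch_disable_virtualization_cpu().
   */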
Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-ID: <20240830043600.127750-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/arm.c | 6 +++--- arch/loongarch/kvm/main.c | 4 ++-- arch/mips/kvm/mips.c | 4 ++-- arch/riscv/kvm/main.c | 4 ++-- arch/x86/kvm/x86.c | 6 +++--- include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 4 ++-- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9bef7638342ef..9c8f5390ec63b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2163,7 +2163,7 @@ static void cpu_hyp_uninit(void *discard) } } -int kvm_arch_hardware_enable(void) +int kvm_arch_enable_virtualization_cpu(void) { /* * Most calls to this function are made with migration @@ -2183,7 +2183,7 @@ int kvm_arch_hardware_enable(void) return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { kvm_timer_cpu_down(); kvm_vgic_cpu_down(); @@ -2379,7 +2379,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits) /* * The stub hypercalls are now disabled, so set our local flag to - * prevent a later re-init attempt in kvm_arch_hardware_enable(). + * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu(). */ __this_cpu_write(kvm_hyp_initialized, 1); preempt_enable(); diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index 844736b99d387..27e9b94c0a0b6 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -261,7 +261,7 @@ long kvm_arch_dev_ioctl(struct file *filp, return -ENOIOCTLCMD; } -int kvm_arch_hardware_enable(void) +int kvm_arch_enable_virtualization_cpu(void) { unsigned long env, gcfg = 0; @@ -300,7 +300,7 @@ int kvm_arch_hardware_enable(void) return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { write_csr_gcfg(0); write_csr_gstat(0); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index b5de770b092e3..52e1f275351e8 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -125,12 +125,12 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return 1; } -int kvm_arch_hardware_enable(void) +int kvm_arch_enable_virtualization_cpu(void) { return kvm_mips_callbacks->hardware_enable(); } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { kvm_mips_callbacks->hardware_disable(); } diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index bab2ec34cd876..f3427f6de6080 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -20,7 +20,7 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -int kvm_arch_hardware_enable(void) +int kvm_arch_enable_virtualization_cpu(void) { csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT); csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT); @@ -35,7 +35,7 @@ int kvm_arch_hardware_enable(void) return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { kvm_riscv_aia_disable(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70219e4069874..1182baf0d4874 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -355,7 +355,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn) /* * Disabling irqs at this point since the following code could be - * interrupted and executed through kvm_arch_hardware_disable() + * interrupted and executed through kvm_arch_disable_virtualization_cpu() */ local_irq_save(flags); if (msrs->registered) { @@ -12512,7 +12512,7 @@ void 
kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) } EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); -int kvm_arch_hardware_enable(void) +int kvm_arch_enable_virtualization_cpu(void) { struct kvm *kvm; struct kvm_vcpu *vcpu; @@ -12608,7 +12608,7 @@ int kvm_arch_hardware_enable(void) return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { kvm_x86_call(hardware_disable)(); drop_user_return_notifiers(); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b23c6d48392f7..4ceecba64f932 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1521,8 +1521,8 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} #endif #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING -int kvm_arch_hardware_enable(void); -void kvm_arch_hardware_disable(void); +int kvm_arch_enable_virtualization_cpu(void); +void kvm_arch_disable_virtualization_cpu(void); #endif int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4efbf9f841496..7ada2b669d51c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5583,7 +5583,7 @@ static int kvm_enable_virtualization_cpu(void) if (__this_cpu_read(virtualization_enabled)) return 0; - if (kvm_arch_hardware_enable()) { + if (kvm_arch_enable_virtualization_cpu()) { pr_info("kvm: enabling virtualization on CPU%d failed\n", raw_smp_processor_id()); return -EIO; @@ -5608,7 +5608,7 @@ static void kvm_disable_virtualization_cpu(void *ign) if (!__this_cpu_read(virtualization_enabled)) return; - kvm_arch_hardware_disable(); + kvm_arch_disable_virtualization_cpu(); __this_cpu_write(virtualization_enabled, false); } From 5381eca101fd80a0f0ad028482181e6180e3561c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:55 -0700 Subject: [PATCH 088/352] KVM: MIPS: Rename virtualization {en,dis}abling APIs to match common KVM Rename MIPS's trampoline hooks for virtualization enabling to match the recently renamed arch hooks. No functional change intended. 
Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 4 ++-- arch/mips/kvm/mips.c | 4 ++-- arch/mips/kvm/vz.c | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 6743a57c1ab43..f7222eb594ea6 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -728,8 +728,8 @@ struct kvm_mips_callbacks { int (*handle_fpe)(struct kvm_vcpu *vcpu); int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); int (*handle_guest_exit)(struct kvm_vcpu *vcpu); - int (*hardware_enable)(void); - void (*hardware_disable)(void); + int (*enable_virtualization_cpu)(void); + void (*disable_virtualization_cpu)(void); int (*check_extension)(struct kvm *kvm, long ext); int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 52e1f275351e8..60b43ea85c125 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -127,12 +127,12 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_enable_virtualization_cpu(void) { - return kvm_mips_callbacks->hardware_enable(); + return kvm_mips_callbacks->enable_virtualization_cpu(); } void kvm_arch_disable_virtualization_cpu(void) { - kvm_mips_callbacks->hardware_disable(); + kvm_mips_callbacks->disable_virtualization_cpu(); } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 99d5a71e43000..ccab4d76b1265 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -2869,7 +2869,7 @@ static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size) return ret + 1; } -static int kvm_vz_hardware_enable(void) +static int kvm_vz_enable_virtualization_cpu(void) { unsigned int mmu_size, guest_mmu_size, ftlb_size; u64 guest_cvmctl, cvmvmconfig; @@ -2983,7 +2983,7 @@ static int kvm_vz_hardware_enable(void) return 0; } -static void kvm_vz_hardware_disable(void) +static void kvm_vz_disable_virtualization_cpu(void) { u64 cvmvmconfig; unsigned int mmu_size; @@ -3280,8 +3280,8 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = { .handle_msa_disabled = kvm_trap_vz_handle_msa_disabled, .handle_guest_exit = kvm_trap_vz_handle_guest_exit, - .hardware_enable = kvm_vz_hardware_enable, - .hardware_disable = kvm_vz_hardware_disable, + .enable_virtualization_cpu = kvm_vz_enable_virtualization_cpu, + .disable_virtualization_cpu = kvm_vz_disable_virtualization_cpu, .check_extension = kvm_vz_check_extension, .vcpu_init = kvm_vz_vcpu_init, .vcpu_uninit = kvm_vz_vcpu_uninit, From 0617a769ce16b836659c8a712f394bfa3543a587 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:56 -0700 Subject: [PATCH 089/352] KVM: x86: Rename virtualization {en,dis}abling APIs to match common KVM Rename x86's the per-CPU vendor hooks used to enable virtualization in hardware to align with the recently renamed arch hooks. No functional change intended. 
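For context on how the renamed vendor hooks are reached: the KVM_X86_OP() entry and kvm_x86_call() sites in the hunks below are part of KVM x86's static-call dispatch, roughly as sketched here (a simplification; the real macros live in arch/x86/include/asm/kvm-x86-ops.h and kvm_host.h and may differ in detail):

  /*
   * KVM_X86_OP(enable_virtualization_cpu) in kvm-x86-ops.h expands, depending
   * on the including site, into the declaration/definition of a static call
   * named kvm_x86_enable_virtualization_cpu.
   *
   * When a vendor module loads, its ops table is applied to the static calls,
   * conceptually:
   *
   *   static_call_update(kvm_x86_enable_virtualization_cpu,
   *                      ops->enable_virtualization_cpu);   /* svm_* or vmx_* */
   *
   * A call site such as kvm_arch_enable_virtualization_cpu() then reaches the
   * vendor implementation via:
   *
   *   kvm_x86_call(enable_virtualization_cpu)();
   *   /* i.e. static_call(kvm_x86_enable_virtualization_cpu)() */
   */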
Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-ID: <20240830043600.127750-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 4 ++-- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/svm/svm.c | 18 +++++++++--------- arch/x86/kvm/vmx/main.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 10 +++++----- arch/x86/kvm/vmx/x86_ops.h | 4 ++-- arch/x86/kvm/x86.c | 10 +++++----- 7 files changed, 27 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 68ad4f923664e..03b7e13f15bbd 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -14,8 +14,8 @@ BUILD_BUG_ON(1) * be __static_call_return0. */ KVM_X86_OP(check_processor_compatibility) -KVM_X86_OP(hardware_enable) -KVM_X86_OP(hardware_disable) +KVM_X86_OP(enable_virtualization_cpu) +KVM_X86_OP(disable_virtualization_cpu) KVM_X86_OP(hardware_unsetup) KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e4fc362ba3da7..704aeecfe2c98 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1629,8 +1629,8 @@ struct kvm_x86_ops { int (*check_processor_compatibility)(void); - int (*hardware_enable)(void); - void (*hardware_disable)(void); + int (*enable_virtualization_cpu)(void); + void (*disable_virtualization_cpu)(void); void (*hardware_unsetup)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d6f252555ab3f..a9adbe10c12e2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -592,14 +592,14 @@ static inline void kvm_cpu_svm_disable(void) } } -static void svm_emergency_disable(void) +static void svm_emergency_disable_virtualization_cpu(void) { kvm_rebooting = true; kvm_cpu_svm_disable(); } -static void svm_hardware_disable(void) +static void svm_disable_virtualization_cpu(void) { /* Make sure we clean up behind us */ if (tsc_scaling) @@ -610,7 +610,7 @@ static void svm_hardware_disable(void) amd_pmu_disable_virt(); } -static int svm_hardware_enable(void) +static int svm_enable_virtualization_cpu(void) { struct svm_cpu_data *sd; @@ -1533,7 +1533,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * TSC_AUX is always virtualized for SEV-ES guests when the feature is * available. The user return MSR support is not required in this case * because TSC_AUX is restored on #VMEXIT from the host save area - * (which has been initialized in svm_hardware_enable()). + * (which has been initialized in svm_enable_virtualization_cpu()). */ if (likely(tsc_aux_uret_slot >= 0) && (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) @@ -3132,7 +3132,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * feature is available. The user return MSR support is not * required in this case because TSC_AUX is restored on #VMEXIT * from the host save area (which has been initialized in - * svm_hardware_enable()). + * svm_enable_virtualization_cpu()). 
*/ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) break; @@ -4980,8 +4980,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_processor_compatibility = svm_check_processor_compat, .hardware_unsetup = svm_hardware_unsetup, - .hardware_enable = svm_hardware_enable, - .hardware_disable = svm_hardware_disable, + .enable_virtualization_cpu = svm_enable_virtualization_cpu, + .disable_virtualization_cpu = svm_disable_virtualization_cpu, .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_vcpu_create, @@ -5411,7 +5411,7 @@ static void __svm_exit(void) { kvm_x86_vendor_exit(); - cpu_emergency_unregister_virt_callback(svm_emergency_disable); + cpu_emergency_unregister_virt_callback(svm_emergency_disable_virtualization_cpu); } static int __init svm_init(void) @@ -5427,7 +5427,7 @@ static int __init svm_init(void) if (r) return r; - cpu_emergency_register_virt_callback(svm_emergency_disable); + cpu_emergency_register_virt_callback(svm_emergency_disable_virtualization_cpu); /* * Common KVM initialization _must_ come last, after this, /dev/kvm is diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 0bf35ebe8a1bd..4a5bf92edccf5 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -23,8 +23,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .hardware_unsetup = vmx_hardware_unsetup, - .hardware_enable = vmx_hardware_enable, - .hardware_disable = vmx_hardware_disable, + .enable_virtualization_cpu = vmx_enable_virtualization_cpu, + .disable_virtualization_cpu = vmx_disable_virtualization_cpu, .has_emulated_msr = vmx_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f18c2d8c7476e..cf7d937bfd2c5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -755,7 +755,7 @@ static int kvm_cpu_vmxoff(void) return -EIO; } -static void vmx_emergency_disable(void) +static void vmx_emergency_disable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; @@ -2844,7 +2844,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) return -EFAULT; } -int vmx_hardware_enable(void) +int vmx_enable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2881,7 +2881,7 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } -void vmx_hardware_disable(void) +void vmx_disable_virtualization_cpu(void) { vmclear_local_loaded_vmcss(); @@ -8584,7 +8584,7 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; - cpu_emergency_unregister_virt_callback(vmx_emergency_disable); + cpu_emergency_unregister_virt_callback(vmx_emergency_disable_virtualization_cpu); vmx_cleanup_l1d_flush(); } @@ -8632,7 +8632,7 @@ static int __init vmx_init(void) pi_init_cpu(cpu); } - cpu_emergency_register_virt_callback(vmx_emergency_disable); + cpu_emergency_register_virt_callback(vmx_emergency_disable_virtualization_cpu); vmx_check_vmcs12_offsets(); diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index ce3221cd1d01a..205692c43a8e0 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -13,8 +13,8 @@ extern struct kvm_x86_init_ops vt_init_ops __initdata; void vmx_hardware_unsetup(void); int vmx_check_processor_compat(void); -int vmx_hardware_enable(void); -void vmx_hardware_disable(void); +int vmx_enable_virtualization_cpu(void); +void vmx_disable_virtualization_cpu(void); int vmx_vm_init(struct kvm *kvm); void vmx_vm_destroy(struct kvm *kvm); int 
vmx_vcpu_precreate(struct kvm *kvm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1182baf0d4874..431358167fa83 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9749,7 +9749,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) guard(mutex)(&vendor_module_lock); - if (kvm_x86_ops.hardware_enable) { + if (kvm_x86_ops.enable_virtualization_cpu) { pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); return -EEXIST; } @@ -9876,7 +9876,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return 0; out_unwind_ops: - kvm_x86_ops.hardware_enable = NULL; + kvm_x86_ops.enable_virtualization_cpu = NULL; kvm_x86_call(hardware_unsetup)(); out_mmu_exit: kvm_mmu_vendor_module_exit(); @@ -9917,7 +9917,7 @@ void kvm_x86_vendor_exit(void) WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif mutex_lock(&vendor_module_lock); - kvm_x86_ops.hardware_enable = NULL; + kvm_x86_ops.enable_virtualization_cpu = NULL; mutex_unlock(&vendor_module_lock); } EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); @@ -12528,7 +12528,7 @@ int kvm_arch_enable_virtualization_cpu(void) if (ret) return ret; - ret = kvm_x86_call(hardware_enable)(); + ret = kvm_x86_call(enable_virtualization_cpu)(); if (ret != 0) return ret; @@ -12610,7 +12610,7 @@ int kvm_arch_enable_virtualization_cpu(void) void kvm_arch_disable_virtualization_cpu(void) { - kvm_x86_call(hardware_disable)(); + kvm_x86_call(disable_virtualization_cpu)(); drop_user_return_notifiers(); } From b4886fab6fb620b96ad7eeefb9801c42dfa91741 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:57 -0700 Subject: [PATCH 090/352] KVM: Add a module param to allow enabling virtualization when KVM is loaded Add an on-by-default module param, enable_virt_at_load, to let userspace force virtualization to be enabled in hardware when KVM is initialized, i.e. just before /dev/kvm is exposed to userspace. Enabling virtualization during KVM initialization allows userspace to avoid the additional latency when creating/destroying the first/last VM (or more specifically, on the 0=>1 and 1=>0 edges of creation/destruction). Now that KVM uses the cpuhp framework to do per-CPU enabling, the latency could be non-trivial as the cpuhup bringup/teardown is serialized across CPUs, e.g. the latency could be problematic for use case that need to spin up VMs quickly. Prior to commit 10474ae8945c ("KVM: Activate Virtualization On Demand"), KVM _unconditionally_ enabled virtualization during load, i.e. there's no fundamental reason KVM needs to dynamically toggle virtualization. These days, the only known argument for not enabling virtualization is to allow KVM to be autoloaded without blocking other out-of-tree hypervisors, and such use cases can simply change the module param, e.g. via command line. Note, the aforementioned commit also mentioned that enabling SVM (AMD's virtualization extensions) can result in "using invalid TLB entries". It's not clear whether the changelog was referring to a KVM bug, a CPU bug, or something else entirely. Regardless, leaving virtualization off by default is not a robust "fix", as any protection provided is lost the instant userspace creates the first VM. 
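In terms of the kvm_usage_count refcount used throughout this series, the new parameter only decides whether the 0=>1 transition happens at load time or at first VM creation; a sketch of the resulting timelines (not literal code) follows. The knob is set at load time, e.g. kvm.enable_virt_at_load=0 on the kernel command line or modprobe kvm enable_virt_at_load=0, and since it is registered with mode 0444 the current value can be read back from /sys/module/kvm/parameters/enable_virt_at_load.

  /*
   * enable_virt_at_load=1 (default):
   *   kvm_init():          kvm_usage_count 0 -> 1, virtualization enabled once
   *   first KVM_CREATE_VM:                 1 -> 2  (no cpuhp work, no added latency)
   *   last VM destroyed:                   2 -> 1
   *   kvm_exit():                          1 -> 0, virtualization disabled
   *
   * enable_virt_at_load=0:
   *   kvm_init():          kvm_usage_count stays 0
   *   first KVM_CREATE_VM:                 0 -> 1  (enable serialized across CPUs)
   *   last VM destroyed:                   1 -> 0  (disable)
   */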
Reviewed-by: Chao Gao Acked-by: Kai Huang Reviewed-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../admin-guide/kernel-parameters.txt | 17 +++++++++ virt/kvm/kvm_main.c | 35 +++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 09126bb8cc9ff..1b52b1b7bbc4a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2648,6 +2648,23 @@ Default is Y (on). + kvm.enable_virt_at_load=[KVM,ARM64,LOONGARCH,MIPS,RISCV,X86] + If enabled, KVM will enable virtualization in hardware + when KVM is loaded, and disable virtualization when KVM + is unloaded (if KVM is built as a module). + + If disabled, KVM will dynamically enable and disable + virtualization on-demand when creating and destroying + VMs, i.e. on the 0=>1 and 1=>0 transitions of the + number of VMs. + + Enabling virtualization at module lode avoids potential + latency for creation of the 0=>1 VM, as KVM serializes + virtualization enabling across all online CPUs. The + "cost" of enabling virtualization when KVM is loaded, + is that doing so may interfere with using out-of-tree + hypervisors that want to "own" virtualization hardware. + kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface. Default is false (don't support). diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7ada2b669d51c..3a18da68b0ca1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5571,6 +5571,9 @@ static struct miscdevice kvm_dev = { }; #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +static bool enable_virt_at_load = true; +module_param(enable_virt_at_load, bool, 0444); + __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); @@ -5721,15 +5724,39 @@ static void kvm_disable_virtualization(void) unregister_syscore_ops(&kvm_syscore_ops); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); } + +static int kvm_init_virtualization(void) +{ + if (enable_virt_at_load) + return kvm_enable_virtualization(); + + return 0; +} + +static void kvm_uninit_virtualization(void) +{ + if (enable_virt_at_load) + kvm_disable_virtualization(); +} #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static int kvm_enable_virtualization(void) { return 0; } +static int kvm_init_virtualization(void) +{ + return 0; +} + static void kvm_disable_virtualization(void) { +} + +static void kvm_uninit_virtualization(void) +{ + } #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ @@ -6474,6 +6501,10 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) kvm_gmem_init(module); + r = kvm_init_virtualization(); + if (r) + goto err_virt; + /* * Registration _must_ be the very last thing done, as this exposes * /dev/kvm to userspace, i.e. all infrastructure must be setup! 
@@ -6487,6 +6518,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) return 0; err_register: + kvm_uninit_virtualization(); +err_virt: kvm_vfio_ops_exit(); err_vfio: kvm_async_pf_deinit(); @@ -6512,6 +6545,8 @@ void kvm_exit(void) */ misc_deregister(&kvm_dev); + kvm_uninit_virtualization(); + debugfs_remove_recursive(kvm_debugfs_dir); for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); From b67107a251b01161956892352d53fb122346eda1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:58 -0700 Subject: [PATCH 091/352] KVM: Add arch hooks for enabling/disabling virtualization Add arch hooks that are invoked when KVM enables/disable virtualization. x86 will use the hooks to register an "emergency disable" callback, which is essentially an x86-specific shutdown notifier that is used when the kernel is doing an emergency reboot/shutdown/kexec. Add comments for the declarations to help arch code understand exactly when the callbacks are invoked. Alternatively, the APIs themselves could communicate most of the same info, but kvm_arch_pre_enable_virtualization() and kvm_arch_post_disable_virtualization() are a bit cumbersome, and make it a bit less obvious that they are intended to be implemented as a pair. Reviewed-by: Chao Gao Reviewed-by: Kai Huang Acked-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 14 ++++++++++++++ virt/kvm/kvm_main.c | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4ceecba64f932..dfd6ad66efcc2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1521,6 +1521,20 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} #endif #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +/* + * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under + * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of + * kvm_usage_count, i.e. at the beginning of the generic hardware enabling + * sequence, and at the end of the generic hardware disabling sequence. + */ +void kvm_arch_enable_virtualization(void); +void kvm_arch_disable_virtualization(void); +/* + * kvm_arch_{enable,disable}_virtualization_cpu() are called on "every" CPU to + * do the actual twiddling of hardware bits. The hooks are called on all + * online CPUs when KVM enables/disabled virtualization, and on a single CPU + * when that CPU is onlined/offlined (including for Resume/Suspend). 
+ */ int kvm_arch_enable_virtualization_cpu(void); void kvm_arch_disable_virtualization_cpu(void); #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3a18da68b0ca1..5f9a66df7bfbb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5581,6 +5581,16 @@ static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; +__weak void kvm_arch_enable_virtualization(void) +{ + +} + +__weak void kvm_arch_disable_virtualization(void) +{ + +} + static int kvm_enable_virtualization_cpu(void) { if (__this_cpu_read(virtualization_enabled)) @@ -5680,6 +5690,8 @@ static int kvm_enable_virtualization(void) if (kvm_usage_count++) return 0; + kvm_arch_enable_virtualization(); + r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", kvm_online_cpu, kvm_offline_cpu); if (r) @@ -5710,6 +5722,7 @@ static int kvm_enable_virtualization(void) unregister_syscore_ops(&kvm_syscore_ops); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); err_cpuhp: + kvm_arch_disable_virtualization(); --kvm_usage_count; return r; } @@ -5723,6 +5736,7 @@ static void kvm_disable_virtualization(void) unregister_syscore_ops(&kvm_syscore_ops); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); + kvm_arch_disable_virtualization(); } static int kvm_init_virtualization(void) From 6d55a94222dbc64754fc138dbe4578dc5cac1c8b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:59 -0700 Subject: [PATCH 092/352] x86/reboot: Unconditionally define cpu_emergency_virt_cb typedef Define cpu_emergency_virt_cb even if the kernel is being built without KVM support so that KVM can reference the typedef in asm/kvm_host.h without needing yet more #ifdefs. No functional change intended. Acked-by: Kai Huang Reviewed-by: Chao Gao Reviewed-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/reboot.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 6536873f8fc0a..d0ef2a678d66c 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,8 +25,8 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 -#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) typedef void (cpu_emergency_virt_cb)(void); +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); From 590b09b1d88e18ae57f89930a6f7b89795d2e9f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:36:00 -0700 Subject: [PATCH 093/352] KVM: x86: Register "emergency disable" callbacks when virt is enabled Register the "disable virtualization in an emergency" callback just before KVM enables virtualization in hardware, as there is no functional need to keep the callbacks registered while KVM happens to be loaded, but is inactive, i.e. if KVM hasn't enabled virtualization. Note, unregistering the callback every time the last VM is destroyed could have measurable latency due to the synchronize_rcu() needed to ensure all references to the callback are dropped before KVM is unloaded. 
But the latency should be a small fraction of the total latency of disabling virtualization across all CPUs, and userspace can set enable_virt_at_load to completely eliminate the runtime overhead. Add a pointer in kvm_x86_ops to allow vendor code to provide its callback. There is no reason to force vendor code to do the registration, and either way KVM would need a new kvm_x86_ops hook. Suggested-by: Kai Huang Reviewed-by: Chao Gao Reviewed-by: Kai Huang Acked-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm/svm.c | 5 +---- arch/x86/kvm/vmx/main.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 6 +----- arch/x86/kvm/vmx/x86_ops.h | 1 + arch/x86/kvm/x86.c | 10 ++++++++++ 6 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 704aeecfe2c98..52443ccda320f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -36,6 +36,7 @@ #include #include #include +#include #define __KVM_HAVE_ARCH_VCPU_DEBUGFS @@ -1631,6 +1632,8 @@ struct kvm_x86_ops { int (*enable_virtualization_cpu)(void); void (*disable_virtualization_cpu)(void); + cpu_emergency_virt_cb *emergency_disable_virtualization_cpu; + void (*hardware_unsetup)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a9adbe10c12e2..9a0506ef87dfb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4982,6 +4982,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .hardware_unsetup = svm_hardware_unsetup, .enable_virtualization_cpu = svm_enable_virtualization_cpu, .disable_virtualization_cpu = svm_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_vcpu_create, @@ -5410,8 +5411,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static void __svm_exit(void) { kvm_x86_vendor_exit(); - - cpu_emergency_unregister_virt_callback(svm_emergency_disable_virtualization_cpu); } static int __init svm_init(void) @@ -5427,8 +5426,6 @@ static int __init svm_init(void) if (r) return r; - cpu_emergency_register_virt_callback(svm_emergency_disable_virtualization_cpu); - /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! 
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 4a5bf92edccf5..7e2e78a142574 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -25,6 +25,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .enable_virtualization_cpu = vmx_enable_virtualization_cpu, .disable_virtualization_cpu = vmx_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, + .has_emulated_msr = vmx_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cf7d937bfd2c5..89682832dded7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -755,7 +755,7 @@ static int kvm_cpu_vmxoff(void) return -EIO; } -static void vmx_emergency_disable_virtualization_cpu(void) +void vmx_emergency_disable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; @@ -8584,8 +8584,6 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; - cpu_emergency_unregister_virt_callback(vmx_emergency_disable_virtualization_cpu); - vmx_cleanup_l1d_flush(); } @@ -8632,8 +8630,6 @@ static int __init vmx_init(void) pi_init_cpu(cpu); } - cpu_emergency_register_virt_callback(vmx_emergency_disable_virtualization_cpu); - vmx_check_vmcs12_offsets(); /* diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 205692c43a8e0..b6a7cfc6ae317 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -15,6 +15,7 @@ void vmx_hardware_unsetup(void); int vmx_check_processor_compat(void); int vmx_enable_virtualization_cpu(void); void vmx_disable_virtualization_cpu(void); +void vmx_emergency_disable_virtualization_cpu(void); int vmx_vm_init(struct kvm *kvm); void vmx_vm_destroy(struct kvm *kvm); int vmx_vcpu_precreate(struct kvm *kvm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 431358167fa83..f72e5d89e942d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12512,6 +12512,16 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) } EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); +void kvm_arch_enable_virtualization(void) +{ + cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); +} + +void kvm_arch_disable_virtualization(void) +{ + cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); +} + int kvm_arch_enable_virtualization_cpu(void) { struct kvm *kvm; From 4b30051c4864234ec57290c3d142db7c88f10d8a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Sep 2024 11:09:07 +0200 Subject: [PATCH 094/352] static_call: Handle module init failure correctly in static_call_del_module() Module insertion invokes static_call_add_module() to initialize the static calls in a module. static_call_add_module() invokes __static_call_init(), which allocates a struct static_call_mod to either encapsulate the built-in static call sites of the associated key into it so further modules can be added or to append the module to the module chain. If that allocation fails the function returns with an error code and the module core invokes static_call_del_module() to clean up eventually added static_call_mod entries. This works correctly, when all keys used by the module were converted over to a module chain before the failure. If not then static_call_del_module() causes a #GP as it blindly assumes that key::mods points to a valid struct static_call_mod. 
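The corruption described above stems from low-bit pointer tagging inside a union, which the next paragraphs spell out. As a generic, self-contained illustration of the technique (a toy userspace example, not the kernel's actual types or helpers):

  #include <stdio.h>

  /* Both pointee types are at least 4-byte aligned, so bit 0 of the stored
   * value is free to act as a type tag. */
  struct mods  { struct mods *next; };
  struct sites { unsigned long addr; };

  union key_ptr {
          unsigned long  type;    /* bit 0: 0 = mods, 1 = sites */
          struct mods   *mods;
          struct sites  *sites;
  };

  static int has_mods(union key_ptr k)
  {
          return !(k.type & 1);
  }

  int main(void)
  {
          static struct sites s = { 0x1000 };
          union key_ptr k;

          k.type = (unsigned long)&s | 1UL;          /* store a tagged sites pointer */
          printf("has_mods = %d\n", has_mods(k));    /* 0: must not be used as mods  */

          struct sites *sp = (struct sites *)(k.type & ~1UL);   /* strip the tag */
          printf("addr = %#lx\n", sp->addr);
          return 0;
  }

Walking such a union as if it always held an untagged mods pointer, as the pre-fix static_call_del_module() does, dereferences sites data as though it were a static_call_mod list, hence the #GP.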
The problem is that key::mods is not an individual struct member of struct static_call_key, it's part of a union to save space: union { /* bit 0: 0 = mods, 1 = sites */ unsigned long type; struct static_call_mod *mods; struct static_call_site *sites; }; key::sites is a pointer to the list of built-in usage sites of the static call. The type of the pointer is differentiated by bit 0. A mods pointer has the bit clear, the sites pointer has the bit set. As static_call_del_module() blindly assumes that the pointer is a valid static_call_mod type, it fails to check for this failure case and dereferences the pointer to the list of built-in call sites, which is obviously bogus. Cure it by checking whether the key has a sites or a mods pointer. If it's a sites pointer then the key is not to be touched. As the sites are walked in the same order as in __static_call_init() the site walk can be terminated because all subsequent sites have not been touched by the init code due to the error exit. If it was converted before the allocation failure, then the inner loop which searches for a module match will find nothing. A failure in the second allocation in __static_call_init() is harmless and does not require special treatment. The first allocation succeeded and converted the key to a module chain. That first entry has mod::mod == NULL and mod::next == NULL, so the inner loop of static_call_del_module() will neither find a module match nor a module chain. The next site in the walk was either already converted, but can't match the module, or it will exit the outer loop because it has a static_call_site pointer and not a static_call_mod pointer. Fixes: 9183c3f9ed71 ("static_call: Add inline static call infrastructure") Closes: https://lore.kernel.org/all/20230915082126.4187913-1-ruanjinjie@huawei.com Reported-by: Jinjie Ruan Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Jinjie Ruan Link: https://lore.kernel.org/r/87zfon6b0s.ffs@tglx --- kernel/static_call_inline.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index 639397b5491ca..7bb0962b52291 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -411,6 +411,17 @@ static void static_call_del_module(struct module *mod) for (site = start; site < stop; site++) { key = static_call_key(site); + + /* + * If the key was not updated due to a memory allocation + * failure in __static_call_init() then treating key::sites + * as key::mods in the code below would cause random memory + * access and #GP. In that case all subsequent sites have + * not been touched either, so stop iterating. + */ + if (!static_call_key_has_mods(key)) + break; + if (key == prev_key) continue; From fe513c2ef0a172a58f158e2e70465c4317f0a9a2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Sep 2024 11:08:28 +0200 Subject: [PATCH 095/352] static_call: Replace pointless WARN_ON() in static_call_module_notify() static_call_module_notify() triggers a WARN_ON() when memory allocation fails in __static_call_add_module(). That's not really justified, because the failure case must be correctly handled by the well known call chain and the error code is passed through to the initiating userspace application. A memory allocation failure is not a fatal problem, but the WARN_ON() takes the machine out when panic_on_warn is set. Replace it with a pr_warn().
Fixes: 9183c3f9ed71 ("static_call: Add inline static call infrastructure") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/8734mf7pmb.ffs@tglx --- kernel/static_call_inline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index 7bb0962b52291..5259cda486d05 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -453,7 +453,7 @@ static int static_call_module_notify(struct notifier_block *nb, case MODULE_STATE_COMING: ret = static_call_add_module(mod); if (ret) { - WARN(1, "Failed to allocate memory for static calls"); + pr_warn("Failed to allocate memory for static calls\n"); static_call_del_module(mod); } break; From 3f6821aa147b6e6fe07e8b35999724518b74a632 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Sep 2024 09:13:37 -0700 Subject: [PATCH 096/352] KVM: x86: Forcibly leave nested if RSM to L2 hits shutdown Leave nested mode before synthesizing shutdown (a.k.a. TRIPLE_FAULT) if RSM fails when resuming L2 (a.k.a. guest mode). Architecturally, shutdown on RSM occurs _before_ the transition back to guest mode on both Intel and AMD. On Intel, per the SDM pseudocode, SMRAM state is loaded before critical VMX state: restore state normally from SMRAM; ... CR4.VMXE := value stored internally; IF internal storage indicates that the logical processor had been in VMX operation (root or non-root) THEN enter VMX operation (root or non-root); restore VMX-critical state as defined in Section 32.14.1; ... restore current VMCS pointer; FI; AMD's APM is both less clearcut and more explicit. Because AMD CPUs save VMCB and guest state in SMRAM itself, given the lack of anything in the APM to indicate a shutdown in guest mode is possible, a straightforward reading of the clause on invalid state is that _what_ state is invalid is irrelevant, i.e. all roads lead to shutdown. An RSM causes a processor shutdown if an invalid-state condition is found in the SMRAM state-save area. This fixes a bug found by syzkaller where synthesizing shutdown for L2 led to a nested VM-Exit (if L1 is intercepting shutdown), which in turn caused KVM to complain about trying to cancel a nested VM-Enter (see commit 759cbd59674a ("KVM: x86: nSVM/nVMX: set nested_run_pending on VM entry which is a result of RSM"). Note, Paolo pointed out that KVM shouldn't set nested_run_pending until after loading SMRAM state. But as above, that's only half the story, KVM shouldn't transition to guest mode either. Unfortunately, fixing that mess requires rewriting the nVMX and nSVM RSM flows to not piggyback their nested VM-Enter flows, as executing the nested VM-Enter flows after loading state from SMRAM would clobber much of said state. For now, add a FIXME to call out that transitioning to guest mode before loading state from SMRAM is wrong. 
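The ordering argument above can be pictured with a tiny stand-alone model
(hypothetical names, not KVM code): the SMRAM state is loaded first, and
only if that fails is the vCPU forced out of guest mode before the caller
synthesizes the shutdown, so L1 sees the shutdown rather than an impossible
VM-Exit:

  #include <stdbool.h>
  #include <stdio.h>

  enum result { EMUL_CONTINUE, EMUL_UNHANDLEABLE };

  struct vcpu { bool guest_mode; };

  static enum result rsm_load_state(bool corrupt_smram)
  {
      return corrupt_smram ? EMUL_UNHANDLEABLE : EMUL_CONTINUE;
  }

  static void leave_nested(struct vcpu *v)
  {
      v->guest_mode = false;
  }

  static enum result emulate_rsm(struct vcpu *v, bool corrupt_smram)
  {
      enum result ret = rsm_load_state(corrupt_smram);

      /*
       * Shutdown architecturally happens before the return to guest mode,
       * so drop out of nested mode before the caller delivers TRIPLE_FAULT.
       */
      if (ret != EMUL_CONTINUE && v->guest_mode)
          leave_nested(v);
      return ret;
  }

  int main(void)
  {
      struct vcpu v = { .guest_mode = true };

      if (emulate_rsm(&v, true) != EMUL_CONTINUE)
          printf("shutdown delivered with guest_mode=%d (expected 0)\n",
                 v.guest_mode);
      return 0;
  }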
Link: https://lore.kernel.org/all/CABgObfYaUHXyRmsmg8UjRomnpQ0Jnaog9-L2gMjsjkqChjDYUQ@mail.gmail.com Reported-by: syzbot+988d9efcdf137bc05f66@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/0000000000007a9acb06151e1670@google.com Reported-by: Zheyu Ma Closes: https://lore.kernel.org/all/CAMhUBjmXMYsEoVYw_M8hSZjBMHh24i88QYm-RY6HDta5YZ7Wgw@mail.gmail.com Analyzed-by: Michal Wilczynski Cc: Kishen Maloor Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20240906161337.1118412-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/smm.c | 24 +++++++++++++++++++----- arch/x86/kvm/x86.c | 6 ------ arch/x86/kvm/x86.h | 6 ++++++ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 00e3c27d2a876..85241c0c7f569 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -624,17 +624,31 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) #endif /* - * Give leave_smm() a chance to make ISA-specific changes to the vCPU - * state (e.g. enter guest mode) before loading state from the SMM - * state-save area. + * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest + * mode should happen _after_ loading state from SMRAM. However, KVM + * piggybacks the nested VM-Enter flows (which is wrong for many other + * reasons), and so nSVM/nVMX would clobber state that is loaded from + * SMRAM and from the VMCS/VMCB. */ if (kvm_x86_call(leave_smm)(vcpu, &smram)) return X86EMUL_UNHANDLEABLE; #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - return rsm_load_state_64(ctxt, &smram.smram64); + ret = rsm_load_state_64(ctxt, &smram.smram64); else #endif - return rsm_load_state_32(ctxt, &smram.smram32); + ret = rsm_load_state_32(ctxt, &smram.smram32); + + /* + * If RSM fails and triggers shutdown, architecturally the shutdown + * occurs *before* the transition to guest mode. But due to KVM's + * flawed handling of RSM to L2 (see above), the vCPU may already be + * in_guest_mode(). Force the vCPU out of guest mode before delivering + * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit + * that architecturally shouldn't be possible. 
+ */ + if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu)) + kvm_leave_nested(vcpu); + return ret; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa455a60b5574..92fade53c79f9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -833,12 +833,6 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto ex->payload = payload; } -/* Forcibly leave the nested mode in cases like a vCPU reset */ -static void kvm_leave_nested(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops.nested_ops->leave_nested(vcpu); -} - static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, bool has_payload, unsigned long payload, bool reinject) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 516eb9e28752e..121f5c19613ef 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -108,6 +108,12 @@ static inline unsigned int __shrink_ple_window(unsigned int val, void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); +/* Forcibly leave the nested mode in cases like a vCPU reset */ +static inline void kvm_leave_nested(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops.nested_ops->leave_nested(vcpu); +} + static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) { return vcpu->arch.last_vmentry_cpu != -1; From c32e028057f144f15c06e2f09dfec49b14311910 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:44:48 -0700 Subject: [PATCH 097/352] KVM: selftests: Verify single-stepping a fastpath VM-Exit exits to userspace In x86's debug_regs test, change the RDMSR(MISC_ENABLES) in the single-step testcase to a WRMSR(TSC_DEADLINE) in order to verify that KVM honors KVM_GUESTDBG_SINGLESTEP when handling a fastpath VM-Exit. Note, the extra coverage is effectively Intel-only, as KVM only handles TSC_DEADLINE in the fastpath when the timer is emulated via the hypervisor timer, a.k.a. the VMX preemption timer. Link: https://lore.kernel.org/r/20240830044448.130449-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/debug_regs.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index f6b295e0b2d2b..76cc2df9238a3 100644 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -47,15 +47,18 @@ static void guest_code(void) /* * Single step test, covers 2 basic instructions and 2 emulated * - * Enable interrupts during the single stepping to see that - * pending interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ + * Enable interrupts during the single stepping to see that pending + * interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ. + * + * Write MSR_IA32_TSC_DEADLINE to verify that KVM's fastpath handler + * exits to userspace due to single-step being enabled. */ asm volatile("ss_start: " "sti\n\t" "xor %%eax,%%eax\n\t" "cpuid\n\t" - "movl $0x1a0,%%ecx\n\t" - "rdmsr\n\t" + "movl $" __stringify(MSR_IA32_TSC_DEADLINE) ", %%ecx\n\t" + "wrmsr\n\t" "cli\n\t" : : : "eax", "ebx", "ecx", "edx"); From 4ca077f26d885cbc97e742a5f3572aac244a0f8a Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Fri, 30 Aug 2024 10:25:37 +0800 Subject: [PATCH 098/352] KVM: x86: Remove some unused declarations Commit 238adc77051a ("KVM: Cleanup LAPIC interface") removed kvm_lapic_get_base() but leave declaration. 
And other two declarations were never implenmented since introduction. Signed-off-by: Yue Haibing Link: https://lore.kernel.org/r/20240830022537.2403873-1-yuehaibing@huawei.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.h | 1 - arch/x86/kvm/mmu.h | 2 -- arch/x86/kvm/mmu/mmu_internal.h | 2 -- 3 files changed, 5 deletions(-) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 7ef8ae73e82d3..7c95eedd771e7 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -96,7 +96,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); -u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 4341e0e285712..9dc5dd43ae7f2 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -223,8 +223,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool kvm_mmu_may_ignore_guest_pat(void); -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); - int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1721d97743e99..1469a1d9782df 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -349,8 +349,6 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); -void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); - void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); From 7efb4d8a392a18e37fcdb5e77c111af6e9a9e2f2 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 6 Sep 2024 00:08:37 +1200 Subject: [PATCH 099/352] KVM: VMX: Also clear SGX EDECCSSA in KVM CPU caps when SGX is disabled When SGX EDECCSSA support was added to KVM in commit 16a7fe3728a8 ("KVM/VMX: Allow exposing EDECCSSA user leaf function to KVM guest"), it forgot to clear the X86_FEATURE_SGX_EDECCSSA bit in KVM CPU caps when KVM SGX is disabled. Fix it. Fixes: 16a7fe3728a8 ("KVM/VMX: Allow exposing EDECCSSA user leaf function to KVM guest") Signed-off-by: Kai Huang Link: https://lore.kernel.org/r/20240905120837.579102-1-kai.huang@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 594db9afbc0fb..98d737e0d416b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7963,6 +7963,7 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); kvm_cpu_cap_clear(X86_FEATURE_SGX1); kvm_cpu_cap_clear(X86_FEATURE_SGX2); + kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); } if (vmx_umip_emulated()) From a194a3a13ce0b4cce4b52f328405891ef3a85cb9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 5 Sep 2024 21:34:07 -0700 Subject: [PATCH 100/352] KVM: x86: Move "ack" phase of local APIC IRQ delivery to separate API Split the "ack" phase, i.e. 
the movement of an interrupt from IRR=>ISR, out of kvm_get_apic_interrupt() and into a separate API so that nested VMX can acknowledge a specific interrupt _after_ emulating a VM-Exit from L2 to L1. To correctly emulate nested posted interrupts while APICv is active, KVM must: 1. find the highest pending interrupt. 2. check if that IRQ is L2's notification vector 3. emulate VM-Exit if the IRQ is NOT the notification vector 4. ACK the IRQ in L1 _after_ VM-Exit When APICv is active, the process of moving the IRQ from the IRR to the ISR also requires a VMWRITE to update vmcs01.GUEST_INTERRUPT_STATUS.SVI, and so acknowledging the interrupt before switching to vmcs01 would result in marking the IRQ as in-service in the wrong VMCS. KVM currently fudges around this issue by doing kvm_get_apic_interrupt() smack dab in the middle of emulating VM-Exit, but that hack doesn't play nice with nested posted interrupts, as notification vector IRQs don't trigger a VM-Exit in the first place. Cc: Nathan Chancellor Link: https://lore.kernel.org/r/20240906043413.1049633-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 17 +++++++++++++---- arch/x86/kvm/lapic.h | 1 + 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5bb481aefcbcd..c1b2a615acfd6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2922,14 +2922,13 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) } } -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) +void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector) { - int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; u32 ppr; - if (vector == -1) - return -1; + if (WARN_ON_ONCE(vector < 0 || !apic)) + return; /* * We get here even with APIC virtualization enabled, if doing @@ -2957,6 +2956,16 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) __apic_update_ppr(apic, &ppr); } +} +EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt); + +int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) +{ + int vector = kvm_apic_has_interrupt(vcpu); + + if (vector != -1) + kvm_apic_ack_interrupt(vcpu, vector); + return vector; } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 7ef8ae73e82d3..db80a2965b9c5 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -88,6 +88,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu); void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); +void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); int kvm_apic_accept_events(struct kvm_vcpu *vcpu); From 363010e1dd0efd4778637c1a5a5aaffbcfcae919 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 5 Sep 2024 21:34:08 -0700 Subject: [PATCH 101/352] KVM: nVMX: Get to-be-acknowledge IRQ for nested VM-Exit at injection site Move the logic to get the to-be-acknowledge IRQ for a nested VM-Exit from nested_vmx_vmexit() to vmx_check_nested_events(), which is subtly the one and only path where KVM invokes nested_vmx_vmexit() with EXIT_REASON_EXTERNAL_INTERRUPT. A future fix will perform a last-minute check on L2's nested posted interrupt notification vector, just before injecting a nested VM-Exit. 
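The has-interrupt/ack split introduced above can be sketched with a
simplified stand-alone model (hypothetical, not KVM code; the real
kvm_apic_ack_interrupt() additionally updates PPR and, under APICv, SVI,
which is exactly why the ack must target the right VMCS):

  #include <stdint.h>
  #include <stdio.h>

  struct apic { uint64_t irr[4]; uint64_t isr[4]; };  /* 256 vectors */

  static int highest_vector(const uint64_t *reg)
  {
      for (int i = 3; i >= 0; i--)
          if (reg[i])
              return i * 64 + 63 - __builtin_clzll(reg[i]);
      return -1;
  }

  static int apic_has_interrupt(const struct apic *a)
  {
      return highest_vector(a->irr);      /* peek only, no state change */
  }

  static void apic_ack_interrupt(struct apic *a, int vector)
  {
      if (vector < 0)
          return;
      a->irr[vector / 64] &= ~(1ULL << (vector % 64));    /* IRR -> ISR */
      a->isr[vector / 64] |=  (1ULL << (vector % 64));
  }

  int main(void)
  {
      struct apic a = { .irr = { [3] = 1ULL << 2 } };     /* vector 0xc2 */
      int v = apic_has_interrupt(&a);

      printf("pending vector 0x%x\n", (unsigned)v);
      apic_ack_interrupt(&a, v);      /* ack only after the VM-Exit is emulated */
      printf("next pending vector: %d\n", apic_has_interrupt(&a));
      return 0;
  }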
To handle that scenario correctly, KVM needs to get the interrupt _before_ injecting VM-Exit, as simply querying the highest priority interrupt, via kvm_cpu_has_interrupt(), would result in TOCTOU bug, as a new, higher priority interrupt could arrive between kvm_cpu_has_interrupt() and kvm_cpu_get_interrupt(). Unfortunately, simply moving the call to kvm_cpu_get_interrupt() doesn't suffice, as a VMWRITE to GUEST_INTERRUPT_STATUS.SVI is hiding in kvm_get_apic_interrupt(), and acknowledging the interrupt before nested VM-Exit would cause the VMWRITE to hit vmcs02 instead of vmcs01. Open code a rough equivalent to kvm_cpu_get_interrupt() so that the IRQ is acknowledged after emulating VM-Exit, taking care to avoid the TOCTOU issue described above. Opportunistically convert the WARN_ON() to a WARN_ON_ONCE(). If KVM has a bug that results in a false positive from kvm_cpu_has_interrupt(), spamming dmesg won't help the situation. Note, nested_vmx_reflect_vmexit() can never reflect external interrupts as they are always "wanted" by L0. Link: https://lore.kernel.org/r/20240906043413.1049633-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/irq.c | 3 ++- arch/x86/kvm/vmx/nested.c | 36 ++++++++++++++++++++++++--------- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a93ac1b9be91..aa31c4b949774 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2256,6 +2256,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); +int kvm_cpu_get_extint(struct kvm_vcpu *v); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 3d7eb11d0e456..810da99ff7edc 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); * Read pending interrupt(from non-APIC source) * vector and intack. */ -static int kvm_cpu_get_extint(struct kvm_vcpu *v) +int kvm_cpu_get_extint(struct kvm_vcpu *v) { if (!kvm_cpu_has_extint(v)) { WARN_ON(!lapic_in_kernel(v)); @@ -131,6 +131,7 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v) } else return kvm_pic_read_irq(v->kvm); /* PIC */ } +EXPORT_SYMBOL_GPL(kvm_cpu_get_extint); /* * Read pending interrupt vector and intack. 
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 867de342df33d..1a6dc85cde180 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4285,11 +4285,37 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) } if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { + int irq; + if (block_nested_events) return -EBUSY; if (!nested_exit_on_intr(vcpu)) goto no_vmexit; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + + if (!nested_exit_intr_ack_set(vcpu)) { + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; + } + + irq = kvm_cpu_get_extint(vcpu); + if (irq != -1) { + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); + return 0; + } + + irq = kvm_apic_has_interrupt(vcpu); + WARN_ON_ONCE(irq < 0); + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); + + /* + * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must + * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI + * if APICv is active. + */ + kvm_apic_ack_interrupt(vcpu, irq); return 0; } @@ -4970,14 +4996,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; if (likely(!vmx->fail)) { - if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - nested_exit_intr_ack_set(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; - } - if (vm_exit_reason != -1) trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, vmcs12->exit_qualification, From 8c23670f2b0004edb8f7135e314114f0c3452085 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 5 Sep 2024 21:34:09 -0700 Subject: [PATCH 102/352] KVM: nVMX: Suppress external interrupt VM-Exit injection if there's no IRQ In the should-be-impossible scenario that kvm_cpu_get_interrupt() doesn't return a valid vector after checking kvm_cpu_has_interrupt(), skip VM-Exit injection to reduce the probability of crashing/confusing L1. Now that KVM gets the IRQ _before_ calling nested_vmx_vmexit(), squashing the VM-Exit injection is trivial since there are no actions that need to be undone. Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20240906043413.1049633-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1a6dc85cde180..e57cd849354b0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4305,7 +4305,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) } irq = kvm_apic_has_interrupt(vcpu); - WARN_ON_ONCE(irq < 0); + if (WARN_ON_ONCE(irq < 0)) + goto no_vmexit; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); From 6e0b456547f41bafdad2dc3a72adc7f69326ca4c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 5 Sep 2024 21:34:10 -0700 Subject: [PATCH 103/352] KVM: nVMX: Detect nested posted interrupt NV at nested VM-Exit injection When synthensizing a nested VM-Exit due to an external interrupt, pend a nested posted interrupt if the external interrupt vector matches L2's PI notification vector, i.e. if the interrupt is a PI notification for L2. 
This fixes a bug where KVM will incorrectly inject VM-Exit instead of
processing nested posted interrupt when IPI virtualization is enabled.

Per the SDM, detection of the notification vector doesn't occur until the
interrupt is acknowledged and delivered to the CPU core.

  If the external-interrupt exiting VM-execution control is 1, any unmasked
  external interrupt causes a VM exit (see Section 26.2). If the "process
  posted interrupts" VM-execution control is also 1, this behavior is
  changed and the processor handles an external interrupt as follows:

    1. The local APIC is acknowledged; this provides the processor core
       with an interrupt vector, called here the physical vector.
    2. If the physical vector equals the posted-interrupt notification
       vector, the logical processor continues to the next step. Otherwise,
       a VM exit occurs as it would normally due to an external interrupt;
       the vector is saved in the VM-exit interruption-information field.

For the most part, KVM has avoided problems because a PI NV for L2 that
arrives while L2 is active will be processed by hardware, and KVM checks
for a pending notification vector during nested VM-Enter.  Thus, to hit
the bug, the PI NV interrupt needs to sneak its way into L1's vIRR while
L2 is active.

Without IPI virtualization, the scenario is practically impossible to hit,
modulo L1 doing weird things (see below), as the ordering between
vmx_deliver_posted_interrupt() and nested VM-Enter effectively guarantees
that either the sender will see the vCPU as being in_guest_mode(), or the
receiver will see the interrupt in its vIRR.

With IPI virtualization, introduced by commit d588bb9be1da ("KVM: VMX:
enable IPI virtualization"), the sending CPU effectively implements a
rough equivalent of vmx_deliver_posted_interrupt(), sans the nested PI NV
check.  If the target vCPU has a valid PID, the CPU will send a PI NV
interrupt based on _L1's_ PID, as the sender's IPIv table points at L1
PIDs.

  PIR := 32 bytes at PID_ADDR;             // under lock
  PIR[V] := 1;
  store PIR at PID_ADDR;                   // release lock

  NotifyInfo := 8 bytes at PID_ADDR + 32;  // under lock
  IF NotifyInfo.ON = 0 AND NotifyInfo.SN = 0; THEN
    NotifyInfo.ON := 1;
    SendNotify := 1;
  ELSE
    SendNotify := 0;
  FI;
  store NotifyInfo at PID_ADDR + 32;       // release lock

  IF SendNotify = 1; THEN
    send an IPI specified by NotifyInfo.NDST and NotifyInfo.NV;
  FI;

As a result, the target vCPU ends up receiving an interrupt on KVM's
POSTED_INTR_VECTOR while L2 is running, with an interrupt in L1's PIR for
L2's nested PI NV.  The POSTED_INTR_VECTOR interrupt triggers a VM-Exit
from L2 to L0, KVM moves the interrupt from L1's PIR to vIRR, triggers a
KVM_REQ_EVENT prior to re-entry to L2, and calls vmx_check_nested_events(),
effectively bypassing all of KVM's "early" checks on nested PI NV.

Without IPI virtualization, the bug can likely be hit only if L1 programs
an assigned device to _post_ an interrupt to L2's notification vector, by
way of L1's PID.PIR.  Doing so would allow the interrupt to get into L1's
vIRR without KVM checking vmcs12's NV.  Which is architecturally allowed,
but unlikely behavior for a hypervisor.
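The send side quoted above maps naturally onto a small stand-alone C model
(illustrative only; field names follow the pseudocode, not KVM's
struct pi_desc): a vector is posted into the PIR and a notification IPI is
sent only when ON transitions from 0 to 1 with SN clear:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  struct pi_desc {
      uint64_t pir[4];    /* 256-bit posted-interrupt request bitmap */
      bool on, sn;        /* outstanding notification, suppress notification */
      uint8_t nv;         /* notification vector */
      uint32_t ndst;      /* notification destination */
  };

  /* Returns true if the caller should send the notification IPI (NDST/NV). */
  static bool pi_post(struct pi_desc *pid, uint8_t vector)
  {
      pid->pir[vector / 64] |= 1ULL << (vector % 64);

      if (!pid->on && !pid->sn) {
          pid->on = true;
          return true;
      }
      return false;       /* already notified or suppressed: coalesce */
  }

  int main(void)
  {
      struct pi_desc pid = { .nv = 0xf2, .ndst = 1 };

      printf("first post notifies:  %d\n", pi_post(&pid, 0x31));
      printf("second post notifies: %d\n", pi_post(&pid, 0x32));
      return 0;
  }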
Cc: Zeng Guang Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20240906043413.1049633-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e57cd849354b0..02d6a3a1e80fb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4308,6 +4308,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(irq < 0)) goto no_vmexit; + /* + * If the IRQ is L2's PI notification vector, process posted + * interrupts for L2 instead of injecting VM-Exit, as the + * detection/morphing architecturally occurs when the IRQ is + * delivered to the CPU. Note, only interrupts that are routed + * through the local APIC trigger posted interrupt processing, + * and enabling posted interrupts requires ACK-on-exit. + */ + if (irq == vmx->nested.posted_intr_nv) { + vmx->nested.pi_pending = true; + kvm_apic_clear_irr(vcpu, irq); + goto no_vmexit; + } + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); From aa9477966aabc344ae83555002bd31809f6a9546 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 5 Sep 2024 21:34:11 -0700 Subject: [PATCH 104/352] KVM: x86: Fold kvm_get_apic_interrupt() into kvm_cpu_get_interrupt() Fold kvm_get_apic_interrupt() into kvm_cpu_get_interrupt() now that nVMX essentially open codes kvm_get_apic_interrupt() in order to correctly emulate nested posted interrupts. Opportunistically stop exporting kvm_cpu_get_interrupt(), as the aforementioned nVMX flow was the only user in vendor code. Link: https://lore.kernel.org/r/20240906043413.1049633-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 7 +++++-- arch/x86/kvm/lapic.c | 10 ---------- arch/x86/kvm/lapic.h | 1 - 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 810da99ff7edc..63f66c51975a5 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -142,9 +142,12 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) if (vector != -1) return vector; /* PIC */ - return kvm_get_apic_interrupt(v); /* APIC */ + vector = kvm_apic_has_interrupt(v); /* APIC */ + if (vector != -1) + kvm_apic_ack_interrupt(v, vector); + + return vector; } -EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c1b2a615acfd6..63e67b6301ec6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2959,16 +2959,6 @@ void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector) } EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt); -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) -{ - int vector = kvm_apic_has_interrupt(vcpu); - - if (vector != -1) - kvm_apic_ack_interrupt(vcpu, vector); - - return vector; -} - static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s, bool set) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index db80a2965b9c5..8310ff74be297 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -90,7 +90,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); int kvm_apic_accept_events(struct kvm_vcpu *vcpu); void kvm_lapic_reset(struct kvm_vcpu 
*vcpu, bool init_event); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); From 1ed0f119c5ff66bec663ba5507539ec4a4f33775 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 5 Sep 2024 21:34:12 -0700 Subject: [PATCH 105/352] KVM: nVMX: Explicitly invalidate posted_intr_nv if PI is disabled at VM-Enter Explicitly invalidate posted_intr_nv when emulating nested VM-Enter and posted interrupts are disabled to make it clear that posted_intr_nv is valid if and only if nested posted interrupts are enabled, and as a cheap way to harden against KVM bugs. KVM initializes posted_intr_nv to -1 at vCPU creation and resets it to -1 when unloading vmcs12 and/or leaving nested mode, i.e. this is not a bug fix (or at least, it's not intended to be a bug fix). Note, tracking nested.posted_intr_nv as a u16 subtly adds a measure of safety, as it prevents unintentionally matching KVM's informal "no IRQ" vector of -1, stored as a signed int. Because a u16 can be always be represented as a signed int, the effective "invalid" value of posted_intr_nv, 65535, will be preserved as-is when comparing against an int, i.e. will be zero-extended, not sign-extended, and thus won't get a false positive if KVM is buggy and compares posted_intr_nv against -1. Opportunistically add a comment in vmx_deliver_nested_posted_interrupt() to call out that it must check vmx->nested.posted_intr_nv, not the vector in vmcs12, which is presumably the _entire_ reason nested.posted_intr_nv exists. E.g. vmcs12 is a KVM-controlled snapshot, so there are no TOCTOU races to worry about, the only potential badness is if the vCPU leaves nested and frees vmcs12 between the sender checking is_guest_mode() and dereferencing the vmcs12 pointer. Link: https://lore.kernel.org/r/20240906043413.1049633-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 6 ++++-- arch/x86/kvm/vmx/vmx.c | 7 +++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 02d6a3a1e80fb..fc3d2ba036f60 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2317,10 +2317,12 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 /* Posted interrupts setting is only taken from vmcs12. */ vmx->nested.pi_pending = false; - if (nested_cpu_has_posted_intr(vmcs12)) + if (nested_cpu_has_posted_intr(vmcs12)) { vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - else + } else { + vmx->nested.posted_intr_nv = -1; exec_control &= ~PIN_BASED_POSTED_INTR; + } pin_controls_set(vmx, exec_control); /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 98d737e0d416b..fe99deceebbd1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4215,6 +4215,13 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated + * and freed, and must not be accessed outside of vcpu->mutex. The + * vCPU's cached PI NV is valid if and only if posted interrupts + * enabled in its vmcs12, i.e. checking the vector also checks that + * L1 has enabled posted interrupts for L2. 
+ */ if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { /* From 3dde46a21aa72a3640bf3f6ff5ce7838af06a1f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 5 Sep 2024 21:34:13 -0700 Subject: [PATCH 106/352] KVM: nVMX: Assert that vcpu->mutex is held when accessing secondary VMCSes Add lockdep assertions in get_vmcs12() and get_shadow_vmcs12() to verify the vCPU's mutex is held, as the returned VMCS objects are dynamically allocated/freed when nested VMX is turned on/off, i.e. accessing vmcs12 structures without holding vcpu->mutex is susceptible to use-after-free. Waive the assertion if the VM is being destroyed, as KVM currently forces a nested VM-Exit when freeing the vCPU. If/when that wart is fixed, the assertion can/should be converted to an unqualified lockdep assertion. See also https://lore.kernel.org/all/Zsd0TqCeY3B5Sb5b@google.com. Link: https://lore.kernel.org/r/20240906043413.1049633-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index cce4e2aa30fbf..668b6c83a373c 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -39,11 +39,17 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { + lockdep_assert_once(lockdep_is_held(&vcpu->mutex) || + !refcount_read(&vcpu->kvm->users_count)); + return to_vmx(vcpu)->nested.cached_vmcs12; } static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) { + lockdep_assert_once(lockdep_is_held(&vcpu->mutex) || + !refcount_read(&vcpu->kvm->users_count)); + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; } From ec495f2ab12290b008a691e826b39b895f458945 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 12:14:12 -0700 Subject: [PATCH 107/352] KVM: Write the per-page "segment" when clearing (part of) a guest page Pass "seg" instead of "len" when writing guest memory in kvm_clear_guest(), as "seg" holds the number of bytes to write for the current page, while "len" holds the total bytes remaining. Luckily, all users of kvm_clear_guest() are guaranteed to not cross a page boundary, and so the bug is unhittable in the current code base. 
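The per-page segmentation being fixed can be reproduced in a stand-alone
userspace sketch (hypothetical, modeled loosely on kvm_clear_guest() and
next_segment(); guest memory is just a flat buffer here). Each iteration
must write "seg" bytes, the portion that fits in the current page, not the
remaining "len":

  #include <stdio.h>
  #include <string.h>

  #define PAGE_SIZE 4096

  static int next_segment(unsigned long len, int offset)
  {
      if (len > (unsigned long)(PAGE_SIZE - offset))
          return PAGE_SIZE - offset;
      return (int)len;
  }

  static void clear_guest(unsigned char *guest, unsigned long gpa,
                          unsigned long len)
  {
      int offset = gpa & (PAGE_SIZE - 1);
      unsigned long gfn = gpa / PAGE_SIZE;
      int seg;

      while ((seg = next_segment(len, offset)) != 0) {
          /* Writing "len" bytes here would spill past the page (the bug). */
          memset(guest + gfn * PAGE_SIZE + offset, 0, seg);
          offset = 0;
          len -= seg;
          gfn++;
      }
  }

  int main(void)
  {
      static unsigned char mem[3 * PAGE_SIZE];

      memset(mem, 0xaa, sizeof(mem));
      clear_guest(mem, PAGE_SIZE - 16, 32);   /* straddles a page boundary */
      printf("before: %#x, first cleared: %#x, last cleared: %#x\n",
             mem[PAGE_SIZE - 17], mem[PAGE_SIZE - 16], mem[PAGE_SIZE + 15]);
      return 0;
  }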
Fixes: 2f5414423ef5 ("KVM: remove kvm_clear_guest_page") Reported-by: zyr_ms@outlook.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219104 Link: https://lore.kernel.org/r/20240829191413.900740-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cb2b78e92910f..04011b94edece 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3581,7 +3581,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) int ret; while ((seg = next_segment(len, offset)) != 0) { - ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len); + ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg); if (ret < 0) return ret; offset = 0; From 025dde582bbf31e7618f9283594ef5e2408e384b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 12:14:13 -0700 Subject: [PATCH 108/352] KVM: Harden guest memory APIs against out-of-bounds accesses When reading or writing a guest page, WARN and bail if offset+len would result in a read to a different page so that KVM bugs are more likely to be detected, and so that any such bugs are less likely to escalate to an out-of-bounds access. E.g. if userspace isn't using guard pages and the target page is at the end of a memslot. Note, KVM already hardens itself in similar APIs, e.g. in the "cached" variants, it's just the vanilla APIs that are playing with fire. Link: https://lore.kernel.org/r/20240829191413.900740-3-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 04011b94edece..d51357fd28d7b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3275,6 +3275,9 @@ static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, int r; unsigned long addr; + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) + return -EFAULT; + addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; @@ -3348,6 +3351,9 @@ static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, int r; unsigned long addr; + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) + return -EFAULT; + addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; @@ -3378,6 +3384,9 @@ static int __kvm_write_guest_page(struct kvm *kvm, int r; unsigned long addr; + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) + return -EFAULT; + addr = gfn_to_hva_memslot(memslot, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; From 4ececec19a0914873634ad69bbaca5557c33e855 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:17 -0700 Subject: [PATCH 109/352] KVM: x86/mmu: Replace PFERR_NESTED_GUEST_PAGE with a more descriptive helper Drop the globally visible PFERR_NESTED_GUEST_PAGE and replace it with a more appropriately named is_write_to_guest_page_table(). The macro name is misleading, because while all nNPT walks match PAGE|WRITE|PRESENT, the reverse is not true. No functional change intended. 
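A stand-alone illustration of the predicate named above (the bit positions
are copied from KVM's PFERR_* definitions to the best of this sketch's
knowledge and should be treated as illustrative): a fault qualifies only
when PRESENT, WRITE and GUEST_PAGE are all set, not when any one of them is:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define PFERR_PRESENT_MASK     (1ULL << 0)
  #define PFERR_WRITE_MASK       (1ULL << 1)
  #define PFERR_GUEST_PAGE_MASK  (1ULL << 33)

  static bool is_write_to_guest_page_table(uint64_t error_code)
  {
      const uint64_t mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK |
                            PFERR_PRESENT_MASK;

      return (error_code & mask) == mask;
  }

  int main(void)
  {
      uint64_t plain_write = PFERR_PRESENT_MASK | PFERR_WRITE_MASK;

      printf("plain write fault:       %d\n",
             is_write_to_guest_page_table(plain_write));
      printf("write during guest walk: %d\n",
             is_write_to_guest_page_table(plain_write | PFERR_GUEST_PAGE_MASK));
      return 0;
  }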
Link: https://lore.kernel.org/r/20240831001538.336683-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 4 ---- arch/x86/kvm/mmu/mmu.c | 9 ++++++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a68cb3eba78f..797433994d566 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -280,10 +280,6 @@ enum x86_intercept_stage; #define PFERR_PRIVATE_ACCESS BIT_ULL(49) #define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) -#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ - PFERR_WRITE_MASK | \ - PFERR_PRESENT_MASK) - /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5226bb055d99c..1eedd1932e5f2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5945,6 +5945,13 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, write_unlock(&vcpu->kvm->mmu_lock); } +static bool is_write_to_guest_page_table(u64 error_code) +{ + const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK; + + return (error_code & mask) == mask; +} + int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { @@ -6008,7 +6015,7 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err * and resume the guest. */ if (vcpu->arch.mmu->root_role.direct && - (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { + is_write_to_guest_page_table(error_code)) { kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); return 1; } From 989a84c93f592e6b288fb3b96d2eeec827d75bef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:18 -0700 Subject: [PATCH 110/352] KVM: x86/mmu: Trigger unprotect logic only on write-protection page faults Trigger KVM's various "unprotect gfn" paths if and only if the page fault was a write to a write-protected gfn. To do so, add a new page fault return code, RET_PF_WRITE_PROTECTED, to explicitly and precisely track such page faults. If a page fault requires emulation for any MMIO (or any reason besides write-protection), trying to unprotect the gfn is pointless and risks putting the vCPU into an infinite loop. E.g. KVM will put the vCPU into an infinite loop if the vCPU manages to trigger MMIO on a page table walk. 
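A rough stand-alone sketch of the routing this return code enables
(hypothetical and heavily simplified): only write-protection faults are
allowed to reach the unprotect+retry logic, while MMIO faults go straight
to emulation and are never retried:

  #include <stdbool.h>
  #include <stdio.h>

  enum pf_ret { PF_RETRY, PF_EMULATE, PF_WRITE_PROTECTED, PF_FIXED };

  static enum pf_ret page_fault(bool gfn_write_protected, bool mmio)
  {
      if (mmio)
          return PF_EMULATE;
      if (gfn_write_protected)
          return PF_WRITE_PROTECTED;
      return PF_FIXED;
  }

  static const char *handle(enum pf_ret r, bool unprotect_succeeded)
  {
      switch (r) {
      case PF_WRITE_PROTECTED:
          return unprotect_succeeded ? "unprotected gfn, resume guest"
                                     : "emulate the faulting instruction";
      case PF_EMULATE:
          return "emulate (unprotecting an MMIO gfn would be pointless)";
      case PF_FIXED:
          return "fault fixed, resume guest";
      default:
          return "retry the fault";
      }
  }

  int main(void)
  {
      printf("%s\n", handle(page_fault(true, false), true));
      printf("%s\n", handle(page_fault(true, false), false));
      printf("%s\n", handle(page_fault(false, true), false));
      return 0;
  }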
Fixes: 147277540bbc ("kvm: svm: Add support for additional SVM NPF error codes") Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 75 +++++++++++++++++++-------------- arch/x86/kvm/mmu/mmu_internal.h | 3 ++ arch/x86/kvm/mmu/mmutrace.h | 1 + arch/x86/kvm/mmu/paging_tmpl.h | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 6 +-- 5 files changed, 50 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1eedd1932e5f2..d8deed29091ba 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2896,10 +2896,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, trace_kvm_mmu_set_spte(level, gfn, sptep); } - if (wrprot) { - if (write_fault) - ret = RET_PF_EMULATE; - } + if (wrprot && write_fault) + ret = RET_PF_WRITE_PROTECTED; if (flush) kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); @@ -4531,7 +4529,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return RET_PF_RETRY; if (page_fault_handle_page_track(vcpu, fault)) - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) @@ -4624,7 +4622,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, int r; if (page_fault_handle_page_track(vcpu, fault)) - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) @@ -4703,6 +4701,7 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, switch (r) { case RET_PF_FIXED: case RET_PF_SPURIOUS: + case RET_PF_WRITE_PROTECTED: return 0; case RET_PF_EMULATE: @@ -5952,6 +5951,40 @@ static bool is_write_to_guest_page_table(u64 error_code) return (error_code & mask) == mask; } +static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u64 error_code, int *emulation_type) +{ + bool direct = vcpu->arch.mmu->root_role.direct; + + /* + * Before emulating the instruction, check if the error code + * was due to a RO violation while translating the guest page. + * This can occur when using nested virtualization with nested + * paging in both guests. If true, we simply unprotect the page + * and resume the guest. + */ + if (direct && is_write_to_guest_page_table(error_code)) { + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); + return RET_PF_RETRY; + } + + /* + * The gfn is write-protected, but if emulation fails we can still + * optimistically try to just unprotect the page and let the processor + * re-execute the instruction that caused the page fault. Do not allow + * retrying MMIO emulation, as it's not only pointless but could also + * cause us to enter an infinite loop because the processor will keep + * faulting on the non-existent MMIO address. Retrying an instruction + * from a nested guest is also pointless and dangerous as we are only + * explicitly shadowing L1's page tables, i.e. unprotecting something + * for L1 isn't going to magically fix whatever issue cause L2 to fail. 
+ */ + if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) + *emulation_type |= EMULTYPE_ALLOW_RETRY_PF; + + return RET_PF_EMULATE; +} + int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { @@ -5997,6 +6030,10 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err if (r < 0) return r; + if (r == RET_PF_WRITE_PROTECTED) + r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code, + &emulation_type); + if (r == RET_PF_FIXED) vcpu->stat.pf_fixed++; else if (r == RET_PF_EMULATE) @@ -6007,32 +6044,6 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err if (r != RET_PF_EMULATE) return 1; - /* - * Before emulating the instruction, check if the error code - * was due to a RO violation while translating the guest page. - * This can occur when using nested virtualization with nested - * paging in both guests. If true, we simply unprotect the page - * and resume the guest. - */ - if (vcpu->arch.mmu->root_role.direct && - is_write_to_guest_page_table(error_code)) { - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); - return 1; - } - - /* - * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still - * optimistically try to just unprotect the page and let the processor - * re-execute the instruction that caused the page fault. Do not allow - * retrying MMIO emulation, as it's not only pointless but could also - * cause us to enter an infinite loop because the processor will keep - * faulting on the non-existent MMIO address. Retrying an instruction - * from a nested guest is also pointless and dangerous as we are only - * explicitly shadowing L1's page tables, i.e. unprotecting something - * for L1 isn't going to magically fix whatever issue cause L2 to fail. - */ - if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) - emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1721d97743e99..50d2624111f8c 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -258,6 +258,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); * RET_PF_CONTINUE: So far, so good, keep handling the page fault. * RET_PF_RETRY: let CPU fault again on the address. * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the + * gfn and retry, or emulate the instruction directly. * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. 
@@ -274,6 +276,7 @@ enum { RET_PF_CONTINUE = 0, RET_PF_RETRY, RET_PF_EMULATE, + RET_PF_WRITE_PROTECTED, RET_PF_INVALID, RET_PF_FIXED, RET_PF_SPURIOUS, diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 195d98bc8de85..f35a830ce469b 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -57,6 +57,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); TRACE_DEFINE_ENUM(RET_PF_RETRY); TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED); TRACE_DEFINE_ENUM(RET_PF_INVALID); TRACE_DEFINE_ENUM(RET_PF_FIXED); TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 405bd7ceee2a3..ae7d39ff2d07f 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -806,7 +806,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (page_fault_handle_page_track(vcpu, fault)) { shadow_page_table_clear_flood(vcpu, fault->addr); - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; } r = mmu_topup_memory_caches(vcpu, true); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c7dc49ee73887..8bf44ac9372f2 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1046,10 +1046,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, * protected, emulation is needed. If the emulation was skipped, * the vCPU would have the same fault again. */ - if (wrprot) { - if (fault->write) - ret = RET_PF_EMULATE; - } + if (wrprot && fault->write) + ret = RET_PF_WRITE_PROTECTED; /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) { From 2fb2b7877b3a4cac4de070ef92437b38f13559b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:19 -0700 Subject: [PATCH 111/352] KVM: x86/mmu: Skip emulation on page fault iff 1+ SPs were unprotected When doing "fast unprotection" of nested TDP page tables, skip emulation if and only if at least one gfn was unprotected, i.e. continue with emulation if simply resuming is likely to hit the same fault and risk putting the vCPU into an infinite loop. Note, it's entirely possible to get a false negative, e.g. if a different vCPU faults on the same gfn and unprotects the gfn first, but that's a relatively rare edge case, and emulating is still functionally ok, i.e. saving a few cycles by avoiding emulation isn't worth the risk of putting the vCPU into an infinite loop. Opportunistically rewrite the relevant comment to document in gory detail exactly what scenario the "fast unprotect" logic is handling. Fixes: 147277540bbc ("kvm: svm: Add support for additional SVM NPF error codes") Cc: Yuan Yao Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d8deed29091ba..edca32a4e8746 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5957,16 +5957,37 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool direct = vcpu->arch.mmu->root_role.direct; /* - * Before emulating the instruction, check if the error code - * was due to a RO violation while translating the guest page. - * This can occur when using nested virtualization with nested - * paging in both guests. 
If true, we simply unprotect the page - * and resume the guest. + * Before emulating the instruction, check to see if the access was due + * to a read-only violation while the CPU was walking non-nested NPT + * page tables, i.e. for a direct MMU, for _guest_ page tables in L1. + * If L1 is sharing (a subset of) its page tables with L2, e.g. by + * having nCR3 share lower level page tables with hCR3, then when KVM + * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also + * unknowingly write-protecting L1's guest page tables, which KVM isn't + * shadowing. + * + * Because the CPU (by default) walks NPT page tables using a write + * access (to ensure the CPU can do A/D updates), page walks in L1 can + * trigger write faults for the above case even when L1 isn't modifying + * PTEs. As a result, KVM will unnecessarily emulate (or at least, try + * to emulate) an excessive number of L1 instructions; because L1's MMU + * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs + * and thus no need to emulate in order to guarantee forward progress. + * + * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can + * proceed without triggering emulation. If one or more shadow pages + * was zapped, skip emulation and resume L1 to let it natively execute + * the instruction. If no shadow pages were zapped, then the write- + * fault is due to something else entirely, i.e. KVM needs to emulate, + * as resuming the guest will put it into an infinite loop. + * + * Note, this code also applies to Intel CPUs, even though it is *very* + * unlikely that an L1 will share its page tables (IA32/PAE/paging64 + * format) with L2's page tables (EPT format). */ - if (direct && is_write_to_guest_page_table(error_code)) { - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); + if (direct && is_write_to_guest_page_table(error_code) && + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) return RET_PF_RETRY; - } /* * The gfn is write-protected, but if emulation fails we can still From c1edcc41c3603c65f34000ae031a20971f4e56f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:20 -0700 Subject: [PATCH 112/352] KVM: x86: Retry to-be-emulated insn in "slow" unprotect path iff sp is zapped Resume the guest and thus skip emulation of a non-PTE-writing instruction if and only if unprotecting the gfn actually zapped at least one shadow page. If the gfn is write-protected for some reason other than shadow paging, attempting to unprotect the gfn will effectively fail, and thus retrying the instruction is all but guaranteed to be pointless. This bug has existed for a long time, but was effectively fudged around by the retry RIP+address anti-loop detection. 
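The RIP+address anti-loop detection referred to above amounts to the
following stand-alone sketch (hypothetical, simplified): a repeat fault at
the same RIP on the same just-unprotected address is not retried a second
time:

  #include <stdbool.h>
  #include <stdio.h>

  struct vcpu {
      unsigned long last_retry_rip;
      unsigned long last_retry_addr;
  };

  static bool unprotect_and_retry(struct vcpu *v, unsigned long rip,
                                  unsigned long addr)
  {
      if (rip == v->last_retry_rip && addr == v->last_retry_addr)
          return false;               /* same fault again: emulate instead */

      /* Pretend the gfn was successfully unprotected here. */
      v->last_retry_rip = rip;
      v->last_retry_addr = addr;
      return true;
  }

  int main(void)
  {
      struct vcpu v = { 0, 0 };

      printf("first fault retried:  %d\n",
             unprotect_and_retry(&v, 0x401000, 0x5000));
      printf("repeat fault retried: %d\n",
             unprotect_and_retry(&v, 0x401000, 0x5000));
      return 0;
  }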
Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70219e4069874..5d4bcb999df24 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8965,14 +8965,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) return false; - vcpu->arch.last_retry_eip = ctxt->eip; - vcpu->arch.last_retry_addr = cr2_or_gpa; - if (!vcpu->arch.mmu->root_role.direct) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + if (!kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa))) + return false; + vcpu->arch.last_retry_eip = ctxt->eip; + vcpu->arch.last_retry_addr = cr2_or_gpa; return true; } From 019f3f84a40c88b68ca4d455306b92c20733e784 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:21 -0700 Subject: [PATCH 113/352] KVM: x86: Get RIP from vCPU state when storing it to last_retry_eip Read RIP from vCPU state instead of pulling it from the emulation context when filling last_retry_eip, which is part of the anti-infinite-loop protection used when unprotecting and retrying instructions that hit a write-protected gfn. This will allow reusing the anti-infinite-loop protection in flows that never make it into the emulator. No functional change intended, as ctxt->eip is set to kvm_rip_read() in init_emulate_ctxt(), and EMULTYPE_PF emulation is mutually exclusive with EMULTYPE_NO_DECODE and EMULTYPE_SKIP, i.e. always goes through x86_decode_emulated_instruction() and hasn't advanced ctxt->eip (yet). Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5d4bcb999df24..760d455f77ac9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8971,7 +8971,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (!kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa))) return false; - vcpu->arch.last_retry_eip = ctxt->eip; + vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); vcpu->arch.last_retry_addr = cr2_or_gpa; return true; } From 9c19129e535bfff85bdfcb5a804e19e5aae935b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:22 -0700 Subject: [PATCH 114/352] KVM: x86: Store gpa as gpa_t, not unsigned long, when unprotecting for retry Store the gpa used to unprotect the faulting gfn for retry as a gpa_t, not an unsigned long. This fixes a bug where 32-bit KVM would unprotect and retry the wrong gfn if the gpa had bits 63:32!=0. In practice, this bug is functionally benign, as unprotecting the wrong gfn is purely a performance issue (thanks to the anti-infinite-loop logic). And of course, almost no one runs 32-bit KVM these days. 
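The 32-bit truncation described above is easy to demonstrate in isolation
(illustrative sketch; uint32_t stands in for "unsigned long" on a 32-bit
build):

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  #define PAGE_SHIFT 12
  typedef uint64_t gpa_t;

  int main(void)
  {
      gpa_t cr2_or_gpa = 0x1234567890ULL;         /* bits 63:32 are non-zero */

      uint32_t as_ulong32 = (uint32_t)cr2_or_gpa; /* old: unsigned long on x86-32 */
      gpa_t as_gpa = cr2_or_gpa;                  /* new: gpa_t */

      printf("gfn from truncated gpa: 0x%" PRIx64 "\n",
             (uint64_t)(as_ulong32 >> PAGE_SHIFT));
      printf("gfn from 64-bit gpa:    0x%" PRIx64 "\n", as_gpa >> PAGE_SHIFT);
      return 0;
  }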
Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 760d455f77ac9..d26e107225f7f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8932,7 +8932,8 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, gpa_t cr2_or_gpa, int emulation_type) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; + unsigned long last_retry_eip, last_retry_addr; + gpa_t gpa = cr2_or_gpa; last_retry_eip = vcpu->arch.last_retry_eip; last_retry_addr = vcpu->arch.last_retry_addr; From 01dd4d319207c4cfd51a1c9a1812909e944d8c86 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:23 -0700 Subject: [PATCH 115/352] KVM: x86/mmu: Apply retry protection to "fast nTDP unprotect" path Move the anti-infinite-loop protection provided by last_retry_{eip,addr} into kvm_mmu_write_protect_fault() so that it guards unprotect+retry that never hits the emulator, as well as reexecute_instruction(), which is the last ditch "might as well try it" logic that kicks in when emulation fails on an instruction that faulted on a write-protected gfn. Add a new helper, kvm_mmu_unprotect_gfn_and_retry(), to set the retry fields and deduplicate other code (with more to come). Link: https://lore.kernel.org/r/20240831001538.336683-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 39 ++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 27 +---------------------- 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 797433994d566..fd115feb49b38 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2133,6 +2133,7 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); +bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa); void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index edca32a4e8746..e992da21f4e51 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2713,6 +2713,22 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) return r; } +bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) +{ + gpa_t gpa = cr2_or_gpa; + bool r; + + if (!vcpu->arch.mmu->root_role.direct) + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); + + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + if (r) { + vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); + vcpu->arch.last_retry_addr = cr2_or_gpa; + } + return r; +} + static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa; @@ -5956,6 +5972,27 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, { bool direct = vcpu->arch.mmu->root_role.direct; + /* + * Do not try to unprotect and retry if the vCPU re-faulted on the same + * RIP with the same address that was previously unprotected, as doing + * so will likely put the vCPU into an infinite. E.g. 
if the vCPU uses + * a non-page-table modifying instruction on the PDE that points to the + * instruction, then unprotecting the gfn will unmap the instruction's + * code, i.e. make it impossible for the instruction to ever complete. + */ + if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) && + vcpu->arch.last_retry_addr == cr2_or_gpa) + return RET_PF_EMULATE; + + /* + * Reset the unprotect+retry values that guard against infinite loops. + * The values will be refreshed if KVM explicitly unprotects a gfn and + * retries, in all other cases it's safe to retry in the future even if + * the next page fault happens on the same RIP+address. + */ + vcpu->arch.last_retry_eip = 0; + vcpu->arch.last_retry_addr = 0; + /* * Before emulating the instruction, check to see if the access was due * to a read-only violation while the CPU was walking non-nested NPT @@ -5986,7 +6023,7 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * format) with L2's page tables (EPT format). */ if (direct && is_write_to_guest_page_table(error_code) && - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return RET_PF_RETRY; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d26e107225f7f..ce075e07126bf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8932,27 +8932,13 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, gpa_t cr2_or_gpa, int emulation_type) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr; - gpa_t gpa = cr2_or_gpa; - - last_retry_eip = vcpu->arch.last_retry_eip; - last_retry_addr = vcpu->arch.last_retry_addr; /* * If the emulation is caused by #PF and it is non-page_table * writing instruction, it means the VM-EXIT is caused by shadow * page protected, we can zap the shadow page and retry this * instruction directly. - * - * Note: if the guest uses a non-page-table modifying instruction - * on the PDE that points to the instruction, then we will unmap - * the instruction and go to an infinite loop. So, we cache the - * last retried eip and the last fault address, if we meet the eip - * and the address again, we can break out of the potential infinite - * loop. */ - vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; @@ -8963,18 +8949,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (x86_page_table_writing_insn(ctxt)) return false; - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) - return false; - - if (!vcpu->arch.mmu->root_role.direct) - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - - if (!kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa))) - return false; - - vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); - vcpu->arch.last_retry_addr = cr2_or_gpa; - return true; + return kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa); } static int complete_emulated_mmio(struct kvm_vcpu *vcpu); From dfaae8447c53819749cf3ba10ce24d3c609752e3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:24 -0700 Subject: [PATCH 116/352] KVM: x86/mmu: Try "unprotect for retry" iff there are indirect SPs Try to unprotect shadow pages if and only if indirect_shadow_pages is non- zero, i.e. iff there is at least one protected such shadow page. Pre- checking indirect_shadow_pages avoids taking mmu_lock for write when the gfn is write-protected by a third party, i.e. 
not for KVM shadow paging, and in the *extremely* unlikely case that a different task has already unprotected the last shadow page. Link: https://lore.kernel.org/r/20240831001538.336683-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e992da21f4e51..e24de1f3d1db1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2718,6 +2718,17 @@ bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) gpa_t gpa = cr2_or_gpa; bool r; + /* + * Bail early if there aren't any write-protected shadow pages to avoid + * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked + * by a third party. Reading indirect_shadow_pages without holding + * mmu_lock is safe, as this is purely an optimization, i.e. a false + * positive is benign, and a false negative will simply result in KVM + * skipping the unprotect+retry path, which is also an optimization. + */ + if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + return false; + if (!vcpu->arch.mmu->root_role.direct) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); From 41e6e367d576ce1801dc5c2b106e14cde35e3c80 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:25 -0700 Subject: [PATCH 117/352] KVM: x86: Move EMULTYPE_ALLOW_RETRY_PF to x86_emulate_instruction() Move the sanity checks for EMULTYPE_ALLOW_RETRY_PF to the top of x86_emulate_instruction(). In addition to deduplicating a small amount of code, this makes the connection between EMULTYPE_ALLOW_RETRY_PF and EMULTYPE_PF even more explicit, and will allow dropping retry_instruction() entirely. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce075e07126bf..51d22d9de8d7e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8870,10 +8870,6 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) - return false; - if (!vcpu->arch.mmu->root_role.direct) { /* * Write permission should be allowed since only @@ -8942,10 +8938,6 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) - return false; - if (x86_page_table_writing_insn(ctxt)) return false; @@ -9148,6 +9140,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && + (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))) + emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF; + r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len); if (r != X86EMUL_CONTINUE) { if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) From 2df354e37c1398a85bb43cbbf1f913eb3f91d035 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:26 -0700 Subject: [PATCH 118/352] KVM: x86: Fold retry_instruction() into x86_emulate_instruction() Now that retry_instruction() 
is reasonably tiny, fold it into its sole caller, x86_emulate_instruction(). In addition to getting rid of the absurdly confusing retry_instruction() name, handling the retry in x86_emulate_instruction() pairs it back up with the code that resets last_retry_{eip,address}. No functional change intended. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51d22d9de8d7e..a7961f8a64293 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8924,26 +8924,6 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); } -static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - gpa_t cr2_or_gpa, int emulation_type) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - /* - * If the emulation is caused by #PF and it is non-page_table - * writing instruction, it means the VM-EXIT is caused by shadow - * page protected, we can zap the shadow page and retry this - * instruction directly. - */ - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) - return false; - - if (x86_page_table_writing_insn(ctxt)) - return false; - - return kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa); -} - static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); @@ -9223,7 +9203,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return 1; } - if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) + /* + * If emulation was caused by a write-protection #PF on a non-page_table + * writing instruction, try to unprotect the gfn, i.e. zap shadow pages, + * and retry the instruction, as the vCPU is likely no longer using the + * gfn as a page table. + */ + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && + !x86_page_table_writing_insn(ctxt) && + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return 1; /* this is needed for vmware backdoor interface to work since it From b7e948898e772ac900950c0dac4ca90e905cd0c0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:27 -0700 Subject: [PATCH 119/352] KVM: x86/mmu: Don't try to unprotect an INVALID_GPA If getting the gpa for a gva fails, e.g. because the gva isn't mapped in the guest page tables, don't try to unprotect the invalid gfn. This is mostly a performance fix (avoids unnecessarily taking mmu_lock), as for_each_gfn_valid_sp_with_gptes() won't explode on garbage input, it's simply pointless. 
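A minimal sketch of the resulting shape (plain C with hypothetical names, not the actual KVM code): treat a failed gva->gpa translation as "nothing to unprotect" before doing any work that would need mmu_lock.

#include <stdbool.h>
#include <stdint.h>

#define INVALID_GPA	(~(uint64_t)0)
#define PAGE_SHIFT	12

static bool zap_shadow_pages_for(uint64_t gfn)	/* stand-in for the locked zap */
{
	(void)gfn;
	return true;
}

bool unprotect_gfn_for_retry(uint64_t gpa)
{
	if (gpa == INVALID_GPA)
		return false;	/* gva wasn't mapped in the guest page tables */

	return zap_shadow_pages_for(gpa >> PAGE_SHIFT);
}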
Link: https://lore.kernel.org/r/20240831001538.336683-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e24de1f3d1db1..bafec04b07ea1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2729,8 +2729,11 @@ bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return false; - if (!vcpu->arch.mmu->root_role.direct) + if (!vcpu->arch.mmu->root_role.direct) { gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); + if (gpa == INVALID_GPA) + return false; + } r = kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); if (r) { @@ -2749,6 +2752,8 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) return 0; gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + if (gpa == INVALID_GPA) + return 0; r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); From 29e495bdf847ac6ad0e0d03e5db39a3ed9f12858 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:28 -0700 Subject: [PATCH 120/352] KVM: x86/mmu: Always walk guest PTEs with WRITE access when unprotecting When getting a gpa from a gva to unprotect the associated gfn when an event is awating reinjection, walk the guest PTEs for WRITE as there's no point in unprotecting the gfn if the guest is unable to write the page, i.e. if write-protection can't trigger emulation. Note, the entire flow should be guarded on the access being a write, and even better should be conditioned on actually triggering a write-protect fault. This will be addressed in a future commit. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-14-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index bafec04b07ea1..937fa9a82a439 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2751,7 +2751,7 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) if (vcpu->arch.mmu->root_role.direct) return 0; - gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, NULL); if (gpa == INVALID_GPA) return 0; From b299c273c06f005976cdc1b9e9299d492527607e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:29 -0700 Subject: [PATCH 121/352] KVM: x86/mmu: Move event re-injection unprotect+retry into common path Move the event re-injection unprotect+retry logic into kvm_mmu_write_protect_fault(), i.e. unprotect and retry if and only if the #PF actually hit a write-protected gfn. Note, there is a small possibility that the gfn was unprotected by a different tasking between hitting the #PF and acquiring mmu_lock, but in that case, KVM will resume the guest immediately anyways because KVM will treat the fault as spurious. As a bonus, unprotecting _after_ handling the page fault also addresses the case where the installing a SPTE to handle fault encounters a shadowed PTE, i.e. *creates* a read-only SPTE. Opportunstically add a comment explaining what on earth the intent of the code is, as based on the changelog from commit 577bdc496614 ("KVM: Avoid instruction emulation when event delivery is pending"). 
Link: https://lore.kernel.org/r/20240831001538.336683-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 937fa9a82a439..195ba7430720b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2743,23 +2743,6 @@ bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) return r; } -static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - int r; - - if (vcpu->arch.mmu->root_role.direct) - return 0; - - gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, NULL); - if (gpa == INVALID_GPA) - return 0; - - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - - return r; -} - static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); @@ -4630,8 +4613,6 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, if (!flags) { trace_kvm_page_fault(vcpu, fault_address, error_code); - if (kvm_event_needs_reinjection(vcpu)) - kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { @@ -6037,8 +6018,15 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * Note, this code also applies to Intel CPUs, even though it is *very* * unlikely that an L1 will share its page tables (IA32/PAE/paging64 * format) with L2's page tables (EPT format). - */ - if (direct && is_write_to_guest_page_table(error_code) && + * + * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to + * unprotect the gfn and retry if an event is awaiting reinjection. If + * KVM emulates multiple instructions before completing event injection, + * the event could be delayed beyond what is architecturally allowed, + * e.g. KVM could inject an IRQ after the TPR has been raised. + */ + if (((direct && is_write_to_guest_page_table(error_code)) || + (!direct && kvm_event_needs_reinjection(vcpu))) && kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return RET_PF_RETRY; From 620525739521376a65a690df899e1596d56791f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:30 -0700 Subject: [PATCH 122/352] KVM: x86: Remove manual pfn lookup when retrying #PF after failed emulation Drop the manual pfn look when retrying an instruction that KVM failed to emulation in response to a #PF due to a write-protected gfn. Now that KVM sets EMULTYPE_ALLOW_RETRY_PF if and only if the page fault hit a write- protected gfn, i.e. if and only if there's a writable memslot, there's no need to redo the lookup to avoid retrying an instruction that failed on emulated MMIO (no slot, or a write to a read-only slot). I.e. KVM will never attempt to retry an instruction that failed on emulated MMIO, whereas that was not the case prior to the introduction of RET_PF_WRITE_PROTECTED. 
Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a7961f8a64293..1e9c5ef4a9f58 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8865,7 +8865,6 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type) { gpa_t gpa = cr2_or_gpa; - kvm_pfn_t pfn; if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; @@ -8885,23 +8884,6 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return true; } - /* - * Do not retry the unhandleable instruction if it faults on the - * readonly host memory, otherwise it will goto a infinite loop: - * retry instruction -> write #PF -> emulation fail -> retry - * instruction -> ... - */ - pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); - - /* - * If the instruction failed on the error pfn, it can not be fixed, - * report the error to userspace. - */ - if (is_error_noslot_pfn(pfn)) - return false; - - kvm_release_pfn_clean(pfn); - /* * If emulation may have been triggered by a write to a shadowed page * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the From 19ab2c8be070160be70a88027b3b93106fef7b89 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:31 -0700 Subject: [PATCH 123/352] KVM: x86: Check EMULTYPE_WRITE_PF_TO_SP before unprotecting gfn Don't bother unprotecting the target gfn if EMULTYPE_WRITE_PF_TO_SP is set, as KVM will simply report the emulation failure to userspace. This will allow converting reexecute_instruction() to use kvm_mmu_unprotect_gfn_instead_retry() instead of kvm_mmu_unprotect_page(). Link: https://lore.kernel.org/r/20240831001538.336683-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1e9c5ef4a9f58..7170eee23597a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8869,6 +8869,19 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; + /* + * If the failed instruction faulted on an access to page tables that + * are used to translate any part of the instruction, KVM can't resolve + * the issue by unprotecting the gfn, as zapping the shadow page will + * result in the instruction taking a !PRESENT page fault and thus put + * the vCPU into an infinite loop of page faults. E.g. KVM will create + * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and + * then zap the SPTE to unprotect the gfn, and then do it all over + * again. Report the error to userspace. + */ + if (emulation_type & EMULTYPE_WRITE_PF_TO_SP) + return false; + if (!vcpu->arch.mmu->root_role.direct) { /* * Write permission should be allowed since only @@ -8894,16 +8907,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); /* - * If the failed instruction faulted on an access to page tables that - * are used to translate any part of the instruction, KVM can't resolve - * the issue by unprotecting the gfn, as zapping the shadow page will - * result in the instruction taking a !PRESENT page fault and thus put - * the vCPU into an infinite loop of page faults. E.g. 
KVM will create - * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and - * then zap the SPTE to unprotect the gfn, and then do it all over - * again. Report the error to userspace. + * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible + * all SPTEs were already zapped by a different task. The alternative + * is to report the error to userspace and likely terminate the guest, + * and the last_retry_{eip,addr} checks will prevent retrying the page + * fault indefinitely, i.e. there's nothing to lose by retrying. */ - return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); + return true; } static int complete_emulated_mmio(struct kvm_vcpu *vcpu); From dabc4ff70c35756bc107bc5d035d0f0746396a9a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:32 -0700 Subject: [PATCH 124/352] KVM: x86: Apply retry protection to "unprotect on failure" path Use kvm_mmu_unprotect_gfn_and_retry() in reexecute_instruction() to pick up protection against infinite loops, e.g. if KVM somehow manages to encounter an unsupported instruction and unprotecting the gfn doesn't allow the vCPU to make forward progress. Other than that, the retry-on- failure logic is a functionally equivalent, open coded version of kvm_mmu_unprotect_gfn_and_retry(). Note, the emulation failure path still isn't fully protected, as KVM won't update the retry protection fields if no shadow pages are zapped (but this change is still a step forward). That flaw will be addressed in a future patch. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7170eee23597a..ad942892fa2c5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8864,8 +8864,6 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type) { - gpa_t gpa = cr2_or_gpa; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; @@ -8882,29 +8880,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (emulation_type & EMULTYPE_WRITE_PF_TO_SP) return false; - if (!vcpu->arch.mmu->root_role.direct) { - /* - * Write permission should be allowed since only - * write access need to be emulated. - */ - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - - /* - * If the mapping is invalid in guest, let cpu retry - * it to generate fault. - */ - if (gpa == INVALID_GPA) - return true; - } - /* * If emulation may have been triggered by a write to a shadowed page * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the * guest to let the CPU re-execute the instruction in the hope that the * CPU can cleanly execute the instruction that KVM failed to emulate. 
*/ - if (vcpu->kvm->arch.indirect_shadow_pages) - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa); /* * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible From 4df685664bed04794ad72b58d8af1fa4fcc60261 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:33 -0700 Subject: [PATCH 125/352] KVM: x86: Update retry protection fields when forcing retry on emulation failure When retrying the faulting instruction after emulation failure, refresh the infinite loop protection fields even if no shadow pages were zapped, i.e. avoid hitting an infinite loop even when retrying the instruction as a last-ditch effort to avoid terminating the guest. Link: https://lore.kernel.org/r/20240831001538.336683-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 10 +++++++++- arch/x86/kvm/mmu/mmu.c | 12 +++++++----- arch/x86/kvm/x86.c | 2 +- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fd115feb49b38..cdee59f3d15be 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2133,7 +2133,15 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); -bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa); +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry); + +static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa) +{ + return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false); +} + void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 195ba7430720b..4b4edaf7dc068 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2713,10 +2713,11 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) return r; } -bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry) { gpa_t gpa = cr2_or_gpa; - bool r; + bool r = false; /* * Bail early if there aren't any write-protected shadow pages to avoid @@ -2727,16 +2728,17 @@ bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) * skipping the unprotect+retry path, which is also an optimization. */ if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) - return false; + goto out; if (!vcpu->arch.mmu->root_role.direct) { gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); if (gpa == INVALID_GPA) - return false; + goto out; } r = kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); - if (r) { +out: + if (r || always_retry) { vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); vcpu->arch.last_retry_addr = cr2_or_gpa; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ad942892fa2c5..843ddb982b35c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8886,7 +8886,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * guest to let the CPU re-execute the instruction in the hope that the * CPU can cleanly execute the instruction that KVM failed to emulate. 
*/ - kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa); + __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true); /* * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible From 2876624e1adcd9a3a3ffa8c4fe3bf8dbba969d95 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:34 -0700 Subject: [PATCH 126/352] KVM: x86: Rename reexecute_instruction()=>kvm_unprotect_and_retry_on_failure() Rename reexecute_instruction() to kvm_unprotect_and_retry_on_failure() to make the intent and purpose of the helper much more obvious. No functional change intended. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-20-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 843ddb982b35c..cd9725df36809 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8861,8 +8861,9 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return 1; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - int emulation_type) +static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa, + int emulation_type) { if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; @@ -9129,8 +9130,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (reexecute_instruction(vcpu, cr2_or_gpa, - emulation_type)) + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) return 1; if (ctxt->have_exception && @@ -9216,7 +9217,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return 1; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type)) + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) return 1; return handle_emulation_failure(vcpu, emulation_type); From 6b3dcabc10911711eba15816d808e2a18f130406 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:35 -0700 Subject: [PATCH 127/352] KVM: x86/mmu: Subsume kvm_mmu_unprotect_page() into the and_retry() version Fold kvm_mmu_unprotect_page() into kvm_mmu_unprotect_gfn_and_retry() now that all other direct usage is gone. No functional change intended. 
Link: https://lore.kernel.org/r/20240831001538.336683-21-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu/mmu.c | 33 +++++++++++++-------------------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cdee59f3d15be..8f4164f58b6c1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2132,7 +2132,6 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool always_retry); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4b4edaf7dc068..29305403f9561 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2695,27 +2695,12 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) write_unlock(&kvm->mmu_lock); } -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - LIST_HEAD(invalid_list); - int r; - - r = 0; - write_lock(&kvm->mmu_lock); - for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - r = 1; - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } - kvm_mmu_commit_zap_page(kvm, &invalid_list); - write_unlock(&kvm->mmu_lock); - - return r; -} - bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool always_retry) { + struct kvm *kvm = vcpu->kvm; + LIST_HEAD(invalid_list); + struct kvm_mmu_page *sp; gpa_t gpa = cr2_or_gpa; bool r = false; @@ -2727,7 +2712,7 @@ bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * positive is benign, and a false negative will simply result in KVM * skipping the unprotect+retry path, which is also an optimization. */ - if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + if (!READ_ONCE(kvm->arch.indirect_shadow_pages)) goto out; if (!vcpu->arch.mmu->root_role.direct) { @@ -2736,7 +2721,15 @@ bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, goto out; } - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + r = false; + write_lock(&kvm->mmu_lock); + for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) { + r = true; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + } + kvm_mmu_commit_zap_page(kvm, &invalid_list); + write_unlock(&kvm->mmu_lock); + out: if (r || always_retry) { vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); From d859b16161c81ee929b7b02a85227b8e3250bc97 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:36 -0700 Subject: [PATCH 128/352] KVM: x86/mmu: Detect if unprotect will do anything based on invalid_list Explicitly query the list of to-be-zapped shadow pages when checking to see if unprotecting a gfn for retry has succeeded, i.e. if KVM should retry the faulting instruction. Add a comment to explain why the list needs to be checked before zapping, which is the primary motivation for this change. No functional change intended. 
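The ordering subtlety is easy to demonstrate in isolation (toy C with hypothetical names; the real code uses list_head and kvm_mmu_commit_zap_page(), which empties the list as part of committing the zap):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node { struct node *next; };

static bool list_empty(const struct node *head)
{
	return head == NULL;
}

static void commit_zap(struct node **head)	/* stand-in: consumes the list */
{
	*head = NULL;
}

int main(void)
{
	struct node sp = { .next = NULL };
	struct node *invalid_list = &sp;	/* one shadow page queued for zapping */

	bool r = !list_empty(invalid_list);	/* snapshot first: true */

	commit_zap(&invalid_list);

	/* A post-commit check reports "nothing zapped" even though one page was. */
	printf("snapshot before commit: %d, check after commit: %d\n",
	       r, !list_empty(invalid_list));
	return 0;
}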
Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-22-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 29305403f9561..ebbdc979a0694 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2721,12 +2721,15 @@ bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, goto out; } - r = false; write_lock(&kvm->mmu_lock); - for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) { - r = true; + for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } + + /* + * Snapshot the result before zapping, as zapping will remove all list + * entries, i.e. checking the list later would yield a false negative. + */ + r = !list_empty(&invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); From 98a69b96caca3e07aff57ca91fd7cc3a3853871a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:37 -0700 Subject: [PATCH 129/352] KVM: x86/mmu: WARN on MMIO cache hit when emulating write-protected gfn WARN if KVM gets an MMIO cache hit on a RET_PF_WRITE_PROTECTED fault, as KVM should return RET_PF_WRITE_PROTECTED if and only if there is a memslot, and creating a memslot is supposed to invalidate the MMIO cache by virtue of changing the memslot generation. Keep the code around mainly to provide a convenient location to document why emulated MMIO should be impossible. Suggested-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-23-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ebbdc979a0694..330b87a1c80ad 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5988,6 +5988,18 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, vcpu->arch.last_retry_eip = 0; vcpu->arch.last_retry_addr = 0; + /* + * It should be impossible to reach this point with an MMIO cache hit, + * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid, + * writable memslot, and creating a memslot should invalidate the MMIO + * cache by way of changing the memslot generation. WARN and disallow + * retry if MMIO is detected, as retrying MMIO emulation is pointless + * and could put the vCPU into an infinite loop because the processor + * will keep faulting on the non-existent MMIO address. + */ + if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct))) + return RET_PF_EMULATE; + /* * Before emulating the instruction, check to see if the access was due * to a read-only violation while the CPU was walking non-nested NPT @@ -6029,17 +6041,15 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return RET_PF_RETRY; /* - * The gfn is write-protected, but if emulation fails we can still - * optimistically try to just unprotect the page and let the processor + * The gfn is write-protected, but if KVM detects its emulating an + * instruction that is unlikely to be used to modify page tables, or if + * emulation fails, KVM can try to unprotect the gfn and let the CPU * re-execute the instruction that caused the page fault. 
Do not allow - * retrying MMIO emulation, as it's not only pointless but could also - * cause us to enter an infinite loop because the processor will keep - * faulting on the non-existent MMIO address. Retrying an instruction - * from a nested guest is also pointless and dangerous as we are only - * explicitly shadowing L1's page tables, i.e. unprotecting something - * for L1 isn't going to magically fix whatever issue cause L2 to fail. - */ - if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) + * retrying an instruction from a nested guest as KVM is only explicitly + * shadowing L1's page tables, i.e. unprotecting something for L1 isn't + * going to magically fix whatever issue caused L2 to fail. + */ + if (!is_guest_mode(vcpu)) *emulation_type |= EMULTYPE_ALLOW_RETRY_PF; return RET_PF_EMULATE; From 0a37fffda14538036f5402a29920285284fe5d5b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:22 -0700 Subject: [PATCH 130/352] KVM: x86/mmu: Move walk_slot_rmaps() up near for_each_slot_rmap_range() Move walk_slot_rmaps() and friends up near for_each_slot_rmap_range() so that the walkers can be used to handle mmu_notifier invalidations, and so that similar function has some amount of locality in code. No functional change intended. Link: https://lore.kernel.org/r/20240809194335.1726916-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 106 ++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 330b87a1c80ad..17edf1499be74 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1516,6 +1516,59 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) +/* The return value indicates if tlb flush on all vcpus is needed. 
*/ +typedef bool (*slot_rmaps_handler) (struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot); + +static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, + bool flush_on_yield, bool flush) +{ + struct slot_rmap_walk_iterator iterator; + + lockdep_assert_held_write(&kvm->mmu_lock); + + for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap, slot); + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + if (flush && flush_on_yield) { + kvm_flush_remote_tlbs_range(kvm, start_gfn, + iterator.gfn - start_gfn + 1); + flush = false; + } + cond_resched_rwlock_write(&kvm->mmu_lock); + } + } + + return flush; +} + +static __always_inline bool walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + bool flush_on_yield) +{ + return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, + slot->base_gfn, slot->base_gfn + slot->npages - 1, + flush_on_yield, false); +} + +static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + bool flush_on_yield) +{ + return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); +} + typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level); @@ -6272,59 +6325,6 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, } EXPORT_SYMBOL_GPL(kvm_configure_mmu); -/* The return value indicates if tlb flush on all vcpus is needed. 
*/ -typedef bool (*slot_rmaps_handler) (struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot); - -static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, - bool flush_on_yield, bool flush) -{ - struct slot_rmap_walk_iterator iterator; - - lockdep_assert_held_write(&kvm->mmu_lock); - - for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, - end_gfn, &iterator) { - if (iterator.rmap) - flush |= fn(kvm, iterator.rmap, slot); - - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush && flush_on_yield) { - kvm_flush_remote_tlbs_range(kvm, start_gfn, - iterator.gfn - start_gfn + 1); - flush = false; - } - cond_resched_rwlock_write(&kvm->mmu_lock); - } - } - - return flush; -} - -static __always_inline bool walk_slot_rmaps(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - int start_level, int end_level, - bool flush_on_yield) -{ - return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, - slot->base_gfn, slot->base_gfn + slot->npages - 1, - flush_on_yield, false); -} - -static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - bool flush_on_yield) -{ - return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); -} - static void free_mmu_pages(struct kvm_mmu *mmu) { if (!tdp_enabled && mmu->pae_root) From 5b1fb116e1a636701627a6eb202d17be93e8f7a8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:23 -0700 Subject: [PATCH 131/352] KVM: x86/mmu: Plumb a @can_yield parameter into __walk_slot_rmaps() Add a @can_yield param to __walk_slot_rmaps() to control whether or not dropping mmu_lock and conditionally rescheduling is allowed. This will allow using __walk_slot_rmaps() and thus cond_resched() to handle mmu_notifier invalidations, which usually allow blocking/yielding, but not when invoked by the OOM killer. 
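As a loose userspace analogue of the new parameter (hypothetical and simplified; sched_yield() stands in for cond_resched(), and the real walker also folds in TLB flushing and lock contention checks):

#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

/* Only yield when the caller says blocking is allowed, e.g. pass false when
 * invoked from a context that must not sleep, such as the OOM case above. */
static long walk_items(long nr, bool can_yield)
{
	long handled = 0;

	for (long i = 0; i < nr; i++) {
		handled++;			/* stand-in for zapping one rmap entry */

		if (can_yield && (i & 1023) == 0)
			sched_yield();		/* analogue of cond_resched() */
	}
	return handled;
}

int main(void)
{
	printf("blocking allowed:    %ld items\n", walk_items(1L << 20, true));
	printf("blocking disallowed: %ld items\n", walk_items(1L << 20, false));
	return 0;
}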
Link: https://lore.kernel.org/r/20240809194335.1726916-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17edf1499be74..e3adc934559d0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1526,7 +1526,8 @@ static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, slot_rmaps_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, - bool flush_on_yield, bool flush) + bool can_yield, bool flush_on_yield, + bool flush) { struct slot_rmap_walk_iterator iterator; @@ -1537,6 +1538,9 @@ static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, if (iterator.rmap) flush |= fn(kvm, iterator.rmap, slot); + if (!can_yield) + continue; + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && flush_on_yield) { kvm_flush_remote_tlbs_range(kvm, start_gfn, @@ -1558,7 +1562,7 @@ static __always_inline bool walk_slot_rmaps(struct kvm *kvm, { return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, slot->base_gfn, slot->base_gfn + slot->npages - 1, - flush_on_yield, false); + true, flush_on_yield, false); } static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, @@ -6600,7 +6604,7 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, flush); + start, end - 1, true, true, flush); } } @@ -6888,7 +6892,7 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, */ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, - level, level, start, end - 1, true, false); + level, level, start, end - 1, true, true, false); } /* Must be called with the mmu_lock held in write-mode. */ From dd9eaad744f4ed30913cca423439a1765a760c71 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:24 -0700 Subject: [PATCH 132/352] KVM: x86/mmu: Add a helper to walk and zap rmaps for a memslot Add a dedicated helper to walk and zap rmaps for a given memslot so that the code can be shared between KVM-initiated zaps and mmu_notifier invalidations. No functional change intended. 
Link: https://lore.kernel.org/r/20240809194335.1726916-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e3adc934559d0..70b043d7701dd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1573,6 +1573,16 @@ static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); } +static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, bool can_yield, + bool flush) +{ + return __walk_slot_rmaps(kvm, slot, __kvm_zap_rmap, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, can_yield, true, flush); +} + typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level); @@ -6602,9 +6612,8 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e if (WARN_ON_ONCE(start >= end)) continue; - flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, - PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, true, flush); + flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start, + end, true, flush); } } From 548f87f667a38ffeb2f021d9cfbc1f1b34fb4cb5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:25 -0700 Subject: [PATCH 133/352] KVM: x86/mmu: Honor NEED_RESCHED when zapping rmaps and blocking is allowed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert kvm_unmap_gfn_range(), which is the helper that zaps rmap SPTEs in response to an mmu_notifier invalidation, to use __kvm_rmap_zap_gfn_range() and feed in range->may_block. In other words, honor NEED_RESCHED by way of cond_resched() when zapping rmaps. This fixes a long-standing issue where KVM could process an absurd number of rmap entries without ever yielding, e.g. if an mmu_notifier fired on a PUD (or larger) range. Opportunistically rename __kvm_zap_rmap() to kvm_zap_rmap(), and drop the old kvm_zap_rmap(). Ideally, the shuffling would be done in a different patch, but that just makes the compiler unhappy, e.g. 
arch/x86/kvm/mmu/mmu.c:1462:13: error: ‘kvm_zap_rmap’ defined but not used Reported-by: Peter Xu Link: https://lore.kernel.org/r/20240809194335.1726916-14-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 70b043d7701dd..27a8a4f486c5c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1435,16 +1435,10 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) -{ - return kvm_zap_all_rmap_sptes(kvm, rmap_head); -} - static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level) + const struct kvm_memory_slot *slot) { - return __kvm_zap_rmap(kvm, rmap_head, slot); + return kvm_zap_all_rmap_sptes(kvm, rmap_head); } struct slot_rmap_walk_iterator { @@ -1578,7 +1572,7 @@ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, bool can_yield, bool flush) { - return __walk_slot_rmaps(kvm, slot, __kvm_zap_rmap, + return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, can_yield, true, flush); } @@ -1607,7 +1601,9 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); + flush = __kvm_rmap_zap_gfn_range(kvm, range->slot, + range->start, range->end, + range->may_block, flush); if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); From c17f150000f6b06061dc109bc2dd2858898a62b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:26 -0700 Subject: [PATCH 134/352] KVM: x86/mmu: Morph kvm_handle_gfn_range() into an aging specific helper Rework kvm_handle_gfn_range() into an aging-specic helper, kvm_rmap_age_gfn_range(). In addition to purging a bunch of unnecessary boilerplate code, this sets the stage for aging rmap SPTEs outside of mmu_lock. Note, there's a small functional change, as kvm_test_age_gfn() will now return immediately if a young SPTE is found, whereas previously KVM would continue iterating over other levels. 
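A compact standalone illustration of the test_only shape and the new early return (plain C with hypothetical names; the real walker iterates rmap entries across hugepage levels):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define ACCESSED 0x1u

/* One walker for both aging and testing: in test_only mode, return on the
 * first accessed entry instead of visiting every remaining entry/level. */
static bool age_or_test(unsigned int *sptes, size_t nr, bool test_only)
{
	bool young = false;

	for (size_t i = 0; i < nr; i++) {
		if (!(sptes[i] & ACCESSED))
			continue;

		if (test_only)
			return true;

		sptes[i] &= ~ACCESSED;	/* age: clear the accessed bit */
		young = true;
	}
	return young;
}

int main(void)
{
	unsigned int sptes[] = { 0x0, ACCESSED, ACCESSED };

	printf("test_age: %d\n", age_or_test(sptes, 3, true));	/* 1, stops at the first hit */
	printf("age:      %d\n", age_or_test(sptes, 3, false));	/* 1, clears both bits */
	return 0;
}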
Link: https://lore.kernel.org/r/20240809194335.1726916-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 68 ++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 46 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 27a8a4f486c5c..9b977560677b9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1577,25 +1577,6 @@ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, start, end - 1, can_yield, true, flush); } -typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level); - -static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, - struct kvm_gfn_range *range, - rmap_handler_t handler) -{ - struct slot_rmap_walk_iterator iterator; - bool ret = false; - - for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - range->start, range->end - 1, &iterator) - ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, - iterator.level); - - return ret; -} - bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { bool flush = false; @@ -1615,31 +1596,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) return flush; } -static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level) -{ - u64 *sptep; - struct rmap_iterator iter; - int young = 0; - - for_each_rmap_spte(rmap_head, &iter, sptep) - young |= mmu_spte_age(sptep); - - return young; -} - -static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level) -{ - u64 *sptep; - struct rmap_iterator iter; - - for_each_rmap_spte(rmap_head, &iter, sptep) - if (is_accessed_spte(*sptep)) - return true; - return false; -} - #define RMAP_RECYCLE_THRESHOLD 1000 static void __rmap_add(struct kvm *kvm, @@ -1674,12 +1630,32 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); } +static bool kvm_rmap_age_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, bool test_only) +{ + struct slot_rmap_walk_iterator iterator; + struct rmap_iterator iter; + bool young = false; + u64 *sptep; + + for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + range->start, range->end - 1, &iterator) { + for_each_rmap_spte(iterator.rmap, &iter, sptep) { + if (test_only && is_accessed_spte(*sptep)) + return true; + + young = mmu_spte_age(sptep); + } + } + return young; +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); + young = kvm_rmap_age_gfn_range(kvm, range, false); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1692,7 +1668,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); + young = kvm_rmap_age_gfn_range(kvm, range, true); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); From 7aac9dc680da9390a22515c3f822c9d1907c4f02 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:27 -0700 Subject: [PATCH 135/352] KVM: x86/mmu: Fold mmu_spte_age() into kvm_rmap_age_gfn_range() Fold mmu_spte_age() into its sole caller now that aging and testing for young SPTEs is 
handled in a common location, i.e. doesn't require more helpers. Opportunistically remove the use of mmu_spte_get_lockless(), as mmu_lock is held (for write!), and marking SPTEs for access tracking outside of mmu_lock is unsafe (at least, as written). I.e. using the lockless accessor is quite misleading. No functional change intended. Link: https://lore.kernel.org/r/20240809194335.1726916-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 50 +++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9b977560677b9..24ca233d78cd8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -614,32 +614,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -/* Returns the Accessed status of the PTE and resets it at the same time. */ -static bool mmu_spte_age(u64 *sptep) -{ - u64 spte = mmu_spte_get_lockless(sptep); - - if (!is_accessed_spte(spte)) - return false; - - if (spte_ad_enabled(spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } else { - /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. - */ - if (is_writable_pte(spte)) - kvm_set_pfn_dirty(spte_to_pfn(spte)); - - spte = mark_spte_for_access_track(spte); - mmu_spte_update_no_track(sptep, spte); - } - - return true; -} - static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) { return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; @@ -1641,10 +1615,30 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm, for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, range->start, range->end - 1, &iterator) { for_each_rmap_spte(iterator.rmap, &iter, sptep) { - if (test_only && is_accessed_spte(*sptep)) + u64 spte = *sptep; + + if (!is_accessed_spte(spte)) + continue; + + if (test_only) return true; - young = mmu_spte_age(sptep); + if (spte_ad_enabled(spte)) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + } else { + /* + * Capture the dirty status of the page, so that + * it doesn't get lost when the SPTE is marked + * for access tracking. + */ + if (is_writable_pte(spte)) + kvm_set_pfn_dirty(spte_to_pfn(spte)); + + spte = mark_spte_for_access_track(spte); + mmu_spte_update_no_track(sptep, spte); + } + young = true; } } return young; From 7645829145a91a3e13fdc322492500dae46ca17c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:28 -0700 Subject: [PATCH 136/352] KVM: x86/mmu: Add KVM_RMAP_MANY to replace open coded '1' and '1ul' literals Replace all of the open coded '1' literals used to mark a PTE list as having many/multiple entries with a proper define. It's hard enough to read the code with one magic bit, and a future patch to support "locking" a single rmap will add another. No functional change intended. Link: https://lore.kernel.org/r/20240809194335.1726916-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 24ca233d78cd8..53e24892055ad 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -912,6 +912,7 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu * in this rmap chain. 
Otherwise, (rmap_head->val & ~1) points to a struct * pte_list_desc containing more mappings. */ +#define KVM_RMAP_MANY BIT(0) /* * Returns the number of pointers in the rmap chain, not counting the new one. @@ -924,16 +925,16 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, if (!rmap_head->val) { rmap_head->val = (unsigned long)spte; - } else if (!(rmap_head->val & 1)) { + } else if (!(rmap_head->val & KVM_RMAP_MANY)) { desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; desc->tail_count = 0; - rmap_head->val = (unsigned long)desc | 1; + rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; ++count; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); count = desc->tail_count + desc->spte_count; /* @@ -942,10 +943,10 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, */ if (desc->spte_count == PTE_LIST_EXT) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); desc->spte_count = 0; desc->tail_count = count; - rmap_head->val = (unsigned long)desc | 1; + rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; } desc->sptes[desc->spte_count++] = spte; } @@ -956,7 +957,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i) { - struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); int j = head_desc->spte_count - 1; /* @@ -985,7 +986,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, if (!head_desc->more) rmap_head->val = 0; else - rmap_head->val = (unsigned long)head_desc->more | 1; + rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY; mmu_free_pte_list_desc(head_desc); } @@ -998,13 +999,13 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) return; - if (!(rmap_head->val & 1)) { + if (!(rmap_head->val & KVM_RMAP_MANY)) { if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) return; rmap_head->val = 0; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { @@ -1037,12 +1038,12 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, if (!rmap_head->val) return false; - if (!(rmap_head->val & 1)) { + if (!(rmap_head->val & KVM_RMAP_MANY)) { mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); goto out; } - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) @@ -1062,10 +1063,10 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) if (!rmap_head->val) return 0; - else if (!(rmap_head->val & 1)) + else if (!(rmap_head->val & KVM_RMAP_MANY)) return 1; - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); return desc->tail_count + desc->spte_count; } @@ -1127,13 +1128,13 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, if (!rmap_head->val) return NULL; - if (!(rmap_head->val & 1)) { + if 
(!(rmap_head->val & KVM_RMAP_MANY)) { iter->desc = NULL; sptep = (u64 *)rmap_head->val; goto out; } - iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); iter->pos = 0; sptep = iter->desc->sptes[iter->pos]; out: From 9a5bff7f5ec2383e3edac5eda561b52e267ccbb5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 9 Aug 2024 12:43:30 -0700 Subject: [PATCH 137/352] KVM: x86/mmu: Use KVM_PAGES_PER_HPAGE() instead of an open coded equivalent Use KVM_PAGES_PER_HPAGE() instead of open coding equivalent logic that is anything but obvious. No functional change intended, and verified by compiling with the below assertions: BUILD_BUG_ON((1UL << KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)) != KVM_PAGES_PER_HPAGE(PG_LEVEL_4K)); BUILD_BUG_ON((1UL << KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) != KVM_PAGES_PER_HPAGE(PG_LEVEL_2M)); BUILD_BUG_ON((1UL << KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) != KVM_PAGES_PER_HPAGE(PG_LEVEL_1G)); Link: https://lore.kernel.org/r/20240809194335.1726916-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 53e24892055ad..b751e7e2a05e4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1464,7 +1464,7 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) { while (++iterator->rmap <= iterator->end_rmap) { - iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); if (iterator->rmap->val) return; From f3009482512eb057e7161214a068c6bd7bae83a4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:16 -0700 Subject: [PATCH 138/352] KVM: VMX: Set PFERR_GUEST_{FINAL,PAGE}_MASK if and only if the GVA is valid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set PFERR_GUEST_{FINAL,PAGE}_MASK based on EPT_VIOLATION_GVA_TRANSLATED if and only if EPT_VIOLATION_GVA_IS_VALID is also set in exit qualification. Per the SDM, bit 8 (EPT_VIOLATION_GVA_TRANSLATED) is valid if and only if bit 7 (EPT_VIOLATION_GVA_IS_VALID) is set, and is '0' if bit 7 is '0'. Bit 7 (a.k.a. EPT_VIOLATION_GVA_IS_VALID) Set if the guest linear-address field is valid. The guest linear-address field is valid for all EPT violations except those resulting from an attempt to load the guest PDPTEs as part of the execution of the MOV CR instruction and those due to trace-address pre-translation Bit 8 (a.k.a. EPT_VIOLATION_GVA_TRANSLATED) If bit 7 is 1: • Set if the access causing the EPT violation is to a guest-physical address that is the translation of a linear address. • Clear if the access causing the EPT violation is to a paging-structure entry as part of a page walk or the update of an accessed or dirty bit. Reserved if bit 7 is 0 (cleared to 0). Failure to guard the logic on GVA_IS_VALID results in KVM marking the page fault as PFERR_GUEST_PAGE_MASK when there is no known GVA, which can put the vCPU into an infinite loop due to kvm_mmu_page_fault() getting a false positive on its PFERR_NESTED_GUEST_PAGE logic (though only because that logic is also buggy/flawed). In practice, this is largely a non-issue because GVA_IS_VALID is almost always set.
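(Editorial aside: a minimal sketch of the guard described above, written against the EPT_VIOLATION_* and PFERR_* definitions in the x86 KVM/VMX headers. It illustrates the intent stated in the changelog -- consult the exit qualification, and only derive the PFERR_GUEST_* bits when the GVA is reported valid -- and is not the literal patch hunk.)

	/*
	 * Sketch: bit 8 (GVA_TRANSLATED) is only meaningful when bit 7
	 * (GVA_IS_VALID) is set in the exit qualification, so gate the
	 * PFERR_GUEST_* derivation on bit 7.
	 */
	if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
		error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
			      PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;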
However, when TDX comes along, GVA_IS_VALID will *never* be set, as the TDX Module deliberately clears bits 12:7 in exit qualification, e.g. so that the faulting virtual address and other metadata that aren't practically useful for the hypervisor aren't leaked to the untrusted host. When exit is due to EPT violation, bits 12-7 of the exit qualification are cleared to 0. Fixes: eebed2438923 ("kvm: nVMX: Add support for fast unprotection of nested guest page tables") Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240831001538.336683-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fe99deceebbd1..ec1aee1f9057e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5807,8 +5807,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) ? PFERR_PRESENT_MASK : 0; - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + if (error_code & EPT_VIOLATION_GVA_IS_VALID) + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; /* * Check that the GPA doesn't exceed physical memory limits, as that is From 1d7f856c2ca449f04a22d876e36b464b7a9d28b6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Sep 2024 12:50:09 +0200 Subject: [PATCH 139/352] jump_label: Fix static_key_slow_dec() yet again While commit 83ab38ef0a0b ("jump_label: Fix concurrency issues in static_key_slow_dec()") fixed one problem, it created yet another, notably the following is now possible: slow_dec if (try_dec) // dec_not_one-ish, false // enabled == 1 slow_inc if (inc_not_disabled) // inc_not_zero-ish // enabled == 2 return guard((mutex)(&jump_label_mutex); if (atomic_cmpxchg(1,0)==1) // false, we're 2 slow_dec if (try-dec) // dec_not_one, true // enabled == 1 return else try_dec() // dec_not_one, false WARN Use dec_and_test instead of cmpxchg(), like it was prior to 83ab38ef0a0b. Add a few WARNs for the paranoid. Fixes: 83ab38ef0a0b ("jump_label: Fix concurrency issues in static_key_slow_dec()") Reported-by: "Darrick J. Wong" Tested-by: Klara Modin Signed-off-by: Peter Zijlstra (Intel) --- kernel/jump_label.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 6dc76b590703e..93a822d3c468c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -168,7 +168,7 @@ bool static_key_slow_inc_cpuslocked(struct static_key *key) jump_label_update(key); /* * Ensure that when static_key_fast_inc_not_disabled() or - * static_key_slow_try_dec() observe the positive value, + * static_key_dec_not_one() observe the positive value, * they must also observe all the text changes. */ atomic_set_release(&key->enabled, 1); @@ -250,7 +250,7 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static bool static_key_slow_try_dec(struct static_key *key) +static bool static_key_dec_not_one(struct static_key *key) { int v; @@ -274,6 +274,14 @@ static bool static_key_slow_try_dec(struct static_key *key) * enabled. This suggests an ordering problem on the user side. */ WARN_ON_ONCE(v < 0); + + /* + * Warn about underflow, and lie about success in an attempt to + * not make things worse. 
+ */ + if (WARN_ON_ONCE(v == 0)) + return true; + if (v <= 1) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1))); @@ -284,15 +292,27 @@ static bool static_key_slow_try_dec(struct static_key *key) static void __static_key_slow_dec_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); + int val; - if (static_key_slow_try_dec(key)) + if (static_key_dec_not_one(key)) return; guard(mutex)(&jump_label_mutex); - if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) + val = atomic_read(&key->enabled); + /* + * It should be impossible to observe -1 with jump_label_mutex held, + * see static_key_slow_inc_cpuslocked(). + */ + if (WARN_ON_ONCE(val == -1)) + return; + /* + * Cannot already be 0, something went sideways. + */ + if (WARN_ON_ONCE(val == 0)) + return; + + if (atomic_dec_and_test(&key->enabled)) jump_label_update(key); - else - WARN_ON_ONCE(!static_key_slow_try_dec(key)); } static void __static_key_slow_dec(struct static_key *key) @@ -329,7 +349,7 @@ void __static_key_slow_dec_deferred(struct static_key *key, { STATIC_KEY_CHECK_USE(key); - if (static_key_slow_try_dec(key)) + if (static_key_dec_not_one(key)) return; schedule_delayed_work(work, timeout); From d00b83d416e73bc3fa4d21b14bec920e88b70ce6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Sep 2024 14:29:05 -0400 Subject: [PATCH 140/352] locking/rwsem: Move is_rwsem_reader_owned() and rwsem_owner() under CONFIG_DEBUG_RWSEMS Both is_rwsem_reader_owned() and rwsem_owner() are currently only used when CONFIG_DEBUG_RWSEMS is defined. This causes a compilation error with clang when `make W=1` and CONFIG_WERROR=y: kernel/locking/rwsem.c:187:20: error: unused function 'is_rwsem_reader_owned' [-Werror,-Wunused-function] 187 | static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) | ^~~~~~~~~~~~~~~~~~~~~ kernel/locking/rwsem.c:271:35: error: unused function 'rwsem_owner' [-Werror,-Wunused-function] 271 | static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) | ^~~~~~~~~~~ Fix this by moving these two functions under the CONFIG_DEBUG_RWSEMS define. Reported-by: Andy Shevchenko Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Shevchenko Tested-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240909182905.161156-1-longman@redhat.com --- kernel/locking/rwsem.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 33cac79e39946..4b041e9c408fe 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -181,12 +181,21 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) __rwsem_set_reader_owned(sem, current); } +#ifdef CONFIG_DEBUG_RWSEMS +/* + * Return just the real task structure pointer of the owner + */ +static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +{ + return (struct task_struct *) + (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); +} + /* * Return true if the rwsem is owned by a reader. */ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) { -#ifdef CONFIG_DEBUG_RWSEMS /* * Check the count to see if it is write-locked. 
*/ @@ -194,11 +203,9 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) if (count & RWSEM_WRITER_MASK) return false; -#endif return rwsem_test_oflags(sem, RWSEM_READER_OWNED); } -#ifdef CONFIG_DEBUG_RWSEMS /* * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there * is a task pointer in owner of a reader-owned rwsem, it will be the @@ -265,15 +272,6 @@ static inline bool rwsem_write_trylock(struct rw_semaphore *sem) return false; } -/* - * Return just the real task structure pointer of the owner - */ -static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) -{ - return (struct task_struct *) - (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); -} - /* * Return the real task structure pointer of the owner and the embedded * flags in the owner. pflags must be non-NULL. From 4440337af4d415c8abf8b9b0e10c79b7518e6e3c Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Wed, 7 Aug 2024 14:35:31 +0200 Subject: [PATCH 141/352] KVM: SVM: let alternatives handle the cases when RSB filling is required Remove superfluous RSB filling after a VMEXIT when the CPU already has flushed the RSB after a VMEXIT when AutoIBRS is enabled. The initial implementation for adding RETPOLINES added an ALTERNATIVES implementation for filling the RSB after a VMEXIT in commit 117cc7a908c8 ("x86/retpoline: Fill return stack buffer on vmexit"). Later, X86_FEATURE_RSB_VMEXIT was added in commit 9756bba28470 ("x86/speculation: Fill RSB on vmexit for IBRS") to handle stuffing the RSB if RETPOLINE=y *or* KERNEL_IBRS=y, i.e. to also stuff the RSB if the kernel is configured to do IBRS mitigations on entry/exit. The AutoIBRS (on AMD) feature implementation added in commit e7862eda309e ("x86/cpu: Support AMD Automatic IBRS") used the already-implemented logic for EIBRS in spectre_v2_determine_rsb_fill_type_on_vmexit() -- but did not update the code at VMEXIT to act on the mode selected in that function -- resulting in VMEXITs continuing to clear the RSB when RETPOLINES are enabled, despite the presence of AutoIBRS. Signed-off-by: Amit Shah Link: https://lore.kernel.org/r/20240807123531.69677-1-amit@kernel.org [sean: massage changeloge, drop comment about AMD not needing RSB_VMEXIT_LITE] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index a0c8eb37d3e1c..2ed80aea3bb13 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -209,10 +209,8 @@ SYM_FUNC_START(__svm_vcpu_run) 7: vmload %_ASM_AX 8: -#ifdef CONFIG_MITIGATION_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE -#endif + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT /* Clobbers RAX, RCX, RDX. */ RESTORE_HOST_SPEC_CTRL @@ -348,10 +346,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) 2: cli -#ifdef CONFIG_MITIGATION_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE -#endif + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). 
*/ RESTORE_HOST_SPEC_CTRL From 3d882cca73be830549833517ddccb3ac4668c04e Mon Sep 17 00:00:00 2001 From: Rafael Rocha Date: Thu, 5 Sep 2024 12:39:21 -0500 Subject: [PATCH 142/352] scsi: st: Fix input/output error on empty drive reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A previous change was introduced to prevent data loss during a power-on reset when a tape is present inside the drive. This commit set the "pos_unknown" flag to true to avoid operations that could compromise data by performing actions from an untracked position. The relevant change is commit 9604eea5bd3a ("scsi: st: Add third party poweron reset handling") As a consequence of this change, a new issue has surfaced: the driver now returns an "Input/output error" even for empty drives when the drive, host, or bus is reset. This issue stems from the "flush_buffer" function, which first checks whether the "pos_unknown" flag is set. If the flag is set, the user will encounter an "Input/output error" until the tape position is known again. This behavior differs from the previous implementation, where empty drives were not affected at system start up time, allowing tape software to send commands to the driver to retrieve the drive's status and other information. The current behavior prioritizes the "pos_unknown" flag over the "ST_NO_TAPE" status, leading to issues for software that detects drives during system startup. This software will receive an "Input/output error" until a tape is loaded and its position is known. To resolve this, the "ST_NO_TAPE" status should take priority when the drive is empty, allowing communication with the drive following a power-on reset. At the same time, the change should continue to protect data by maintaining the "pos_unknown" flag when the drive contains a tape and its position is unknown. Signed-off-by: Rafael Rocha Link: https://lore.kernel.org/r/20240905173921.10944-1-rrochavi@fnal.gov Fixes: 9604eea5bd3a ("scsi: st: Add third party poweron reset handling") Acked-by: Kai Mäkisara Signed-off-by: Martin K. Petersen --- drivers/scsi/st.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 0d8ce1a92168c..d50bad3a2ce92 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -834,6 +834,9 @@ static int flush_buffer(struct scsi_tape *STp, int seek_next) int backspace, result; struct st_partstat *STps; + if (STp->ready != ST_READY) + return 0; + /* * If there was a bus reset, block further access * to this device. @@ -841,8 +844,6 @@ static int flush_buffer(struct scsi_tape *STp, int seek_next) if (STp->pos_unknown) return (-EIO); - if (STp->ready != ST_READY) - return 0; STps = &(STp->ps[STp->partition]); if (STps->rw == ST_WRITING) /* Writing */ return st_flush_write_buffer(STp); From b112947ffc30e9632d5c2acd0e9081e3e6bee01e Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 30 Aug 2024 15:58:58 +0800 Subject: [PATCH 143/352] scsi: sd: Remove duplicate included header file linux/bio-integrity.h The header file linux/bio-integrity.h is included twice. Remove the last one. The compilation test has passed. Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240830075858.3541907-1-lihongbo22@huawei.com Signed-off-by: Martin K. 
Petersen --- drivers/scsi/sd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e90a2e4ab3182..b0e565402f6aa 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include From e3684006945414153a33c5c4d1202dda2b80650f Mon Sep 17 00:00:00 2001 From: Brian King Date: Tue, 3 Sep 2024 08:47:09 -0500 Subject: [PATCH 144/352] scsi: ibmvfc: Add max_sectors module parameter There are some scenarios that can occur, such as performing an upgrade of the virtual I/O server, where the supported max transfer of the backing device for an ibmvfc HBA can change. If the max transfer of the backing device decreases, this can cause issues with previously discovered LUNs. This patch accomplishes two things. First, it changes the default ibmvfc max transfer value to 1MB. This is generally supported by all backing devices, which should mitigate this issue out of the box. Secondly, it adds a module parameter, enabling a user to increase the max transfer value to values that are larger than 1MB, as long as they have configured these larger values on the virtual I/O server as well. [mkp: fix checkpatch warnings] Signed-off-by: Brian King Link: https://lore.kernel.org/r/20240903134708.139645-2-brking@linux.ibm.com Reviewed-by: Martin Wilck Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/ibmvscsi/ibmvfc.c | 21 ++++++++++++++++++--- drivers/scsi/ibmvscsi/ibmvfc.h | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index a3d1013c83075..e66c3ef74267a 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -37,6 +37,7 @@ static unsigned int default_timeout = IBMVFC_DEFAULT_TIMEOUT; static u64 max_lun = IBMVFC_MAX_LUN; static unsigned int max_targets = IBMVFC_MAX_TARGETS; static unsigned int max_requests = IBMVFC_MAX_REQUESTS_DEFAULT; +static u16 max_sectors = IBMVFC_MAX_SECTORS; static u16 scsi_qdepth = IBMVFC_SCSI_QDEPTH; static unsigned int disc_threads = IBMVFC_MAX_DISC_THREADS; static unsigned int ibmvfc_debug = IBMVFC_DEBUG; @@ -83,6 +84,9 @@ MODULE_PARM_DESC(default_timeout, module_param_named(max_requests, max_requests, uint, S_IRUGO); MODULE_PARM_DESC(max_requests, "Maximum requests for this adapter. " "[Default=" __stringify(IBMVFC_MAX_REQUESTS_DEFAULT) "]"); +module_param_named(max_sectors, max_sectors, ushort, S_IRUGO); +MODULE_PARM_DESC(max_sectors, "Maximum sectors for this adapter. " + "[Default=" __stringify(IBMVFC_MAX_SECTORS) "]"); module_param_named(scsi_qdepth, scsi_qdepth, ushort, S_IRUGO); MODULE_PARM_DESC(scsi_qdepth, "Maximum scsi command depth per adapter queue. 
" "[Default=" __stringify(IBMVFC_SCSI_QDEPTH) "]"); @@ -1494,7 +1498,7 @@ static void ibmvfc_set_login_info(struct ibmvfc_host *vhost) memset(login_info, 0, sizeof(*login_info)); login_info->ostype = cpu_to_be32(IBMVFC_OS_LINUX); - login_info->max_dma_len = cpu_to_be64(IBMVFC_MAX_SECTORS << 9); + login_info->max_dma_len = cpu_to_be64(max_sectors << 9); login_info->max_payload = cpu_to_be32(sizeof(struct ibmvfc_fcp_cmd_iu)); login_info->max_response = cpu_to_be32(sizeof(struct ibmvfc_fcp_rsp)); login_info->partition_num = cpu_to_be32(vhost->partition_number); @@ -5230,7 +5234,7 @@ static void ibmvfc_npiv_login_done(struct ibmvfc_event *evt) } vhost->logged_in = 1; - npiv_max_sectors = min((uint)(be64_to_cpu(rsp->max_dma_len) >> 9), IBMVFC_MAX_SECTORS); + npiv_max_sectors = min((uint)(be64_to_cpu(rsp->max_dma_len) >> 9), max_sectors); dev_info(vhost->dev, "Host partition: %s, device: %s %s %s max sectors %u\n", rsp->partition_name, rsp->device_name, rsp->port_loc_code, rsp->drc_name, npiv_max_sectors); @@ -6329,7 +6333,7 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id) shost->can_queue = scsi_qdepth; shost->max_lun = max_lun; shost->max_id = max_targets; - shost->max_sectors = IBMVFC_MAX_SECTORS; + shost->max_sectors = max_sectors; shost->max_cmd_len = IBMVFC_MAX_CDB_LEN; shost->unique_id = shost->host_no; shost->nr_hw_queues = mq_enabled ? min(max_scsi_queues, nr_scsi_hw_queues) : 1; @@ -6556,6 +6560,7 @@ static struct fc_function_template ibmvfc_transport_functions = { **/ static int __init ibmvfc_module_init(void) { + int min_max_sectors = PAGE_SIZE >> 9; int rc; if (!firmware_has_feature(FW_FEATURE_VIO)) @@ -6564,6 +6569,16 @@ static int __init ibmvfc_module_init(void) printk(KERN_INFO IBMVFC_NAME": IBM Virtual Fibre Channel Driver version: %s %s\n", IBMVFC_DRIVER_VERSION, IBMVFC_DRIVER_DATE); + /* + * Range check the max_sectors module parameter. The upper bounds is + * implicity checked since the parameter is a ushort. + */ + if (max_sectors < min_max_sectors) { + printk(KERN_ERR IBMVFC_NAME ": max_sectors must be at least %d.\n", + min_max_sectors); + max_sectors = min_max_sectors; + } + ibmvfc_transport_template = fc_attach_transport(&ibmvfc_transport_functions); if (!ibmvfc_transport_template) return -ENOMEM; diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h index 745ad5ac72517..c73ed2314ad04 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.h +++ b/drivers/scsi/ibmvscsi/ibmvfc.h @@ -32,7 +32,7 @@ #define IBMVFC_DEBUG 0 #define IBMVFC_MAX_TARGETS 1024 #define IBMVFC_MAX_LUN 0xffffffff -#define IBMVFC_MAX_SECTORS 0xffffu +#define IBMVFC_MAX_SECTORS 2048 #define IBMVFC_MAX_DISC_THREADS 4 #define IBMVFC_TGT_MEMPOOL_SZ 64 #define IBMVFC_MAX_CMDS_PER_LUN 64 From 45fad027df61f848fa9e036e8de6ba009cd1a134 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 1 Sep 2024 22:45:27 +0200 Subject: [PATCH 145/352] scsi: libcxgbi: Remove an unused field in struct cxgbi_device Usage of .dev_ddp_cleanup() in libcxgbi was removed by commit 5999299f1ce9 ("cxgb3i,cxgb4i,libcxgbi: remove iSCSI DDP support") on 2016-07. .csk_rx_pdu_ready() and debugfs_root have apparently never been used since introduction by commit 9ba682f01e2f ("[SCSI] libcxgbi: common library for cxgb3i and cxgb4i") Remove the now unused function pointer from struct cxgbi_device. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/58f77f690d85e2c653447e3e3fc4f8d3c3ce8563.1725223504.git.christophe.jaillet@wanadoo.fr Signed-off-by: Martin K. 
Petersen --- drivers/scsi/cxgbi/libcxgbi.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/scsi/cxgbi/libcxgbi.h b/drivers/scsi/cxgbi/libcxgbi.h index d92cf1dccc2f9..0909b03e24976 100644 --- a/drivers/scsi/cxgbi/libcxgbi.h +++ b/drivers/scsi/cxgbi/libcxgbi.h @@ -485,7 +485,6 @@ struct cxgbi_device { unsigned char nmtus; unsigned char nports; struct pci_dev *pdev; - struct dentry *debugfs_root; struct iscsi_transport *itp; struct module *owner; @@ -499,7 +498,6 @@ struct cxgbi_device { unsigned int rxq_idx_cntr; struct cxgbi_ports_map pmap; - void (*dev_ddp_cleanup)(struct cxgbi_device *); struct cxgbi_ppm* (*cdev2ppm)(struct cxgbi_device *); int (*csk_ddp_set_map)(struct cxgbi_ppm *, struct cxgbi_sock *, struct cxgbi_task_tag_info *); @@ -512,7 +510,6 @@ struct cxgbi_device { unsigned int, int); void (*csk_release_offload_resources)(struct cxgbi_sock *); - int (*csk_rx_pdu_ready)(struct cxgbi_sock *, struct sk_buff *); u32 (*csk_send_rx_credits)(struct cxgbi_sock *, u32); int (*csk_push_tx_frames)(struct cxgbi_sock *, int); void (*csk_send_abort_req)(struct cxgbi_sock *); From e88ed594328900959f8aae72c2e6240703a91f33 Mon Sep 17 00:00:00 2001 From: Yan Zhen Date: Mon, 2 Sep 2024 09:33:03 +0800 Subject: [PATCH 146/352] scsi: fusion: mptctl: Use min() macro Using the real macro is usually more intuitive and readable when the original file is guaranteed to contain the minmax.h header file and compile correctly. Signed-off-by: Yan Zhen Link: https://lore.kernel.org/r/20240902013303.909316-1-yanzhen@vivo.com Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c index 9f3999750c23a..087397ccca594 100644 --- a/drivers/message/fusion/mptctl.c +++ b/drivers/message/fusion/mptctl.c @@ -1609,7 +1609,7 @@ mptctl_eventreport (MPT_ADAPTER *ioc, unsigned long arg) maxEvents = numBytes/sizeof(MPT_IOCTL_EVENTS); - max = MPTCTL_EVENT_LOG_SIZE < maxEvents ? MPTCTL_EVENT_LOG_SIZE : maxEvents; + max = min(maxEvents, MPTCTL_EVENT_LOG_SIZE); /* If fewer than 1 event is requested, there must have * been some type of error. From 57bada8a5e69e6cb0668436b206db1fbdbf7ebfd Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Sep 2024 15:12:02 +0100 Subject: [PATCH 147/352] scsi: zalon: Remove trailing space after \n newline There is a extraneous space after a newline in a dev_printk message, remove it. Also fix non-tabbed indentation of the statement. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240902141202.308632-1-colin.i.king@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/zalon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/zalon.c b/drivers/scsi/zalon.c index 22d412cab91da..15602ec862e30 100644 --- a/drivers/scsi/zalon.c +++ b/drivers/scsi/zalon.c @@ -139,7 +139,7 @@ zalon_probe(struct parisc_device *dev) return -ENODEV; if (request_irq(dev->irq, ncr53c8xx_intr, IRQF_SHARED, "zalon", host)) { - dev_printk(KERN_ERR, &dev->dev, "irq problem with %d, detaching\n ", + dev_printk(KERN_ERR, &dev->dev, "irq problem with %d, detaching\n", dev->irq); goto fail; } From 34f04a9b6e39d0085df5a2a5e6e4e1878d808132 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Sep 2024 15:15:37 +0100 Subject: [PATCH 148/352] scsi: pm8001: Remove trailing space after \n newline There is a extraneous space after a newline in a pm8001_dbg message. Remove it. 
Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240902141537.308914-1-colin.i.king@gmail.com Acked-by: Jack Wang Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm80xx_hwi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index 8fe886dc5e476..a9869cd8c4c07 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -2037,7 +2037,7 @@ mpi_ssp_completion(struct pm8001_hba_info *pm8001_ha, void *piomb) atomic_dec(&pm8001_dev->running_req); break; } - pm8001_dbg(pm8001_ha, IO, "scsi_status = 0x%x\n ", + pm8001_dbg(pm8001_ha, IO, "scsi_status = 0x%x\n", psspPayload->ssp_resp_iu.status); spin_lock_irqsave(&t->task_state_lock, flags); t->task_state_flags &= ~SAS_TASK_STATE_PENDING; From 571d81b482f00dfe8912ecbc3de090f99181ee7a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Sep 2024 15:22:52 +0100 Subject: [PATCH 149/352] scsi: megaraid_sas: Remove trailing space after \n newline There is a extraneous space after a newline in a dev_err message. Remove it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240902142252.309232-1-colin.i.king@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 6c79c350a4d5b..4ecf5284c0fc3 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -6380,7 +6380,7 @@ static int megasas_init_fw(struct megasas_instance *instance) GFP_KERNEL); if (!fusion->stream_detect_by_ld[i]) { dev_err(&instance->pdev->dev, - "unable to allocate stream detect by LD\n "); + "unable to allocate stream detect by LD\n"); for (j = 0; j < i; ++j) kfree(fusion->stream_detect_by_ld[j]); kfree(fusion->stream_detect_by_ld); From d2ce0e5ab505ddaf153c9c20b4f627f0ed034a1e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Sep 2024 15:41:53 +0100 Subject: [PATCH 150/352] scsi: hisi_sas: Remove trailing space after \n newline There is a extraneous space after a newline in a dev_info message. Remove it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240902144153.309920-1-colin.i.king@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index feda9b54b4434..4cd3a3eab6f1c 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -2421,7 +2421,7 @@ static void slot_complete_v3_hw(struct hisi_hba *hisi_hba, spin_lock_irqsave(&device->done_lock, flags); if (test_bit(SAS_HA_FROZEN, &ha->state)) { spin_unlock_irqrestore(&device->done_lock, flags); - dev_info(dev, "slot complete: task(%pK) ignored\n ", + dev_info(dev, "slot complete: task(%pK) ignored\n", task); return; } From fa557da6b05034538ea4ecbf55bac4c23d391e2d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Sep 2024 15:51:38 +0100 Subject: [PATCH 151/352] scsi: qedf: Remove trailing space after \n newline There is a extraneous space after a newline in a QEDF_INFO message. Remove it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240902145138.310883-1-colin.i.king@gmail.com Signed-off-by: Martin K. 
Petersen --- drivers/scsi/qedf/qedf_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qedf/qedf_io.c b/drivers/scsi/qedf/qedf_io.c index 054a51713d556..fcfc3bed02c61 100644 --- a/drivers/scsi/qedf/qedf_io.c +++ b/drivers/scsi/qedf/qedf_io.c @@ -310,7 +310,7 @@ struct qedf_ioreq *qedf_alloc_cmd(struct qedf_rport *fcport, u8 cmd_type) if (!free_sqes) { QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, - "Returning NULL, free_sqes=%d.\n ", + "Returning NULL, free_sqes=%d.\n", free_sqes); goto out_failed; } From c7c846fa94c9f71c4cfab3f15cffc5030cd01e39 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Sep 2024 16:00:42 +0100 Subject: [PATCH 152/352] scsi: lpfc: Remove trailing space after \n newline There is a extraneous space after a newline in two lpfc_printf_log() messages. Remove the space. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240902150042.311157-1-colin.i.king@gmail.com Reviewed-by: Justin Tee Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_sli.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 332b8d2348e9e..0e60eebe53b5d 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -8818,7 +8818,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) rc = lpfc_sli4_queue_setup(phba); if (unlikely(rc)) { lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT, - "0381 Error %d during queue setup.\n ", rc); + "0381 Error %d during queue setup.\n", rc); goto out_stop_timers; } /* Initialize the driver internal SLI layer lists. */ @@ -21149,7 +21149,7 @@ lpfc_drain_txq(struct lpfc_hba *phba) if (!piocbq) { spin_unlock_irqrestore(&pring->ring_lock, iflags); lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT, - "2823 txq empty and txq_cnt is %d\n ", + "2823 txq empty and txq_cnt is %d\n", txq_cnt); break; } From 0557f49870714c8c8cddfdc3c4b805aeae6c3e4e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Sep 2024 18:27:08 +0100 Subject: [PATCH 153/352] scsi: mpt3sas: Remove trailing space after \n newline There is a extraneous space after a newline in an ioc_info message. Remove it and join to split literal strings into one. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240902172708.369741-1-colin.i.king@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 2d3eeda5a6a0d..fc013d7e3f2d6 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -8882,9 +8882,8 @@ _base_check_ioc_facts_changes(struct MPT3SAS_ADAPTER *ioc) ioc->device_remove_in_progress, pd_handles_sz, GFP_KERNEL); if (!device_remove_in_progress) { ioc_info(ioc, - "Unable to allocate the memory for " - "device_remove_in_progress of sz: %d\n " - , pd_handles_sz); + "Unable to allocate the memory for device_remove_in_progress of sz: %d\n", + pd_handles_sz); return -ENOMEM; } memset(device_remove_in_progress + From 0c40f079f1c808e7e480c795a79009f200366eb1 Mon Sep 17 00:00:00 2001 From: Manish Pandey Date: Tue, 3 Sep 2024 12:07:09 +0530 Subject: [PATCH 154/352] scsi: ufs: qcom: Update MODE_MAX cfg_bw value Commit 8db8f6ce556a ("scsi: ufs: qcom: Add missing interconnect bandwidth values for Gear 5") updated the ufs_qcom_bw_table for Gear 5. However, it missed updating the cfg_bw value for the max mode. 
Hence update the cfg_bw value for the max mode for UFS 4.x devices. Fixes: 8db8f6ce556a ("scsi: ufs: qcom: Add missing interconnect bandwidth values for Gear 5") Cc: stable@vger.kernel.org Signed-off-by: Manish Pandey Link: https://lore.kernel.org/r/20240903063709.4335-1-quic_mapa@quicinc.com Reviewed-by: Manivannan Sadhasivam Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-qcom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 810e637047d04..f994cea056170 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -93,7 +93,7 @@ static const struct __ufs_qcom_bw_table { [MODE_HS_RB][UFS_HS_G3][UFS_LANE_2] = { 1492582, 204800 }, [MODE_HS_RB][UFS_HS_G4][UFS_LANE_2] = { 2915200, 409600 }, [MODE_HS_RB][UFS_HS_G5][UFS_LANE_2] = { 5836800, 819200 }, - [MODE_MAX][0][0] = { 7643136, 307200 }, + [MODE_MAX][0][0] = { 7643136, 819200 }, }; static void ufs_qcom_get_default_testbus_cfg(struct ufs_qcom_host *host); From 24d7071d964574cca41fa72a10c211221af37aec Mon Sep 17 00:00:00 2001 From: Tomas Henzl Date: Tue, 3 Sep 2024 16:47:29 +0200 Subject: [PATCH 155/352] scsi: mpi3mr: A performance fix Commit 0c52310f2600 ("hrtimer: Ignore slack time for RT tasks in schedule_hrtimeout_range()") effectively shortens a sleep in a polling function in the driver. That is causing a performance regression as the new value of just 2us is too low; in certain tests the perf drop is ~30%. Fix this by adjusting the sleep to 20us (close to the previous value). Reported-by: Jan Jurca Signed-off-by: Tomas Henzl Acked-by: Sumit Saxena Link: https://lore.kernel.org/r/20240903144729.37218-1-thenzl@redhat.com Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr.h | 2 +- drivers/scsi/mpi3mr/mpi3mr_fw.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h index 1dc640de3efc9..0f9101a17ce9f 100644 --- a/drivers/scsi/mpi3mr/mpi3mr.h +++ b/drivers/scsi/mpi3mr/mpi3mr.h @@ -178,7 +178,7 @@ extern atomic64_t event_counter; #define MPI3MR_DEFAULT_SDEV_QD 32 /* Definitions for Threaded IRQ poll*/ -#define MPI3MR_IRQ_POLL_SLEEP 2 +#define MPI3MR_IRQ_POLL_SLEEP 20 #define MPI3MR_IRQ_POLL_TRIGGER_IOCOUNT 8 /* Definitions for the controller security status*/ diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index 2e1a92d306b20..35ca54ba18e95 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -728,7 +728,7 @@ static irqreturn_t mpi3mr_isr_poll(int irq, void *privdata) mpi3mr_process_op_reply_q(mrioc, intr_info->op_reply_q); - usleep_range(MPI3MR_IRQ_POLL_SLEEP, 10 * MPI3MR_IRQ_POLL_SLEEP); + usleep_range(MPI3MR_IRQ_POLL_SLEEP, MPI3MR_IRQ_POLL_SLEEP + 1); } while (atomic_read(&intr_info->op_reply_q->pend_ios) && (num_op_reply < mrioc->max_host_ios)); From a8598aefae31f3bf27bad1a4cebf2b04a4cdf220 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 4 Sep 2024 14:03:04 -0700 Subject: [PATCH 156/352] scsi: sd: Retry START STOP UNIT commands During system resume, sd_start_stop_device() submits a START STOP UNIT command to the SCSI device that is being resumed. That command is not retried in case of a unit attention and hence may fail.
An example: [16575.983359] sd 0:0:0:3: [sdd] Starting disk [16575.983693] sd 0:0:0:3: [sdd] Start/Stop Unit failed: Result: hostbyte=0x00 driverbyte=DRIVER_OK [16575.983712] sd 0:0:0:3: [sdd] Sense Key : 0x6 [16575.983730] sd 0:0:0:3: [sdd] ASC=0x29 ASCQ=0x0 [16575.983738] sd 0:0:0:3: PM: dpm_run_callback(): scsi_bus_resume+0x0/0xa0 returns -5 [16575.983783] sd 0:0:0:3: PM: failed to resume async: error -5 Make the SCSI core retry the START STOP UNIT command if the device reports that it has been powered on or that it has been reset. Cc: Damien Le Moal Cc: Mike Christie Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240904210304.2947789-1-bvanassche@acm.org Reviewed-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index b0e565402f6aa..43a723164fc53 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -4082,9 +4082,38 @@ static int sd_start_stop_device(struct scsi_disk *sdkp, int start) { unsigned char cmd[6] = { START_STOP }; /* START_VALID */ struct scsi_sense_hdr sshdr; + struct scsi_failure failure_defs[] = { + { + /* Power on, reset, or bus device reset occurred */ + .sense = UNIT_ATTENTION, + .asc = 0x29, + .ascq = 0, + .result = SAM_STAT_CHECK_CONDITION, + }, + { + /* Power on occurred */ + .sense = UNIT_ATTENTION, + .asc = 0x29, + .ascq = 1, + .result = SAM_STAT_CHECK_CONDITION, + }, + { + /* SCSI bus reset */ + .sense = UNIT_ATTENTION, + .asc = 0x29, + .ascq = 2, + .result = SAM_STAT_CHECK_CONDITION, + }, + {} + }; + struct scsi_failures failures = { + .total_allowed = 3, + .failure_definitions = failure_defs, + }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .req_flags = BLK_MQ_REQ_PM, + .failures = &failures, }; struct scsi_device *sdp = sdkp->device; int res; From 4708c9332d975aabc8498ddb85936631535fdc20 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Thu, 5 Sep 2024 10:35:21 +0800 Subject: [PATCH 157/352] scsi: pmcraid: Convert comma to semicolon Replace comma between expressions with semicolons. Using a ',' in place of a ';' can have unintended side effects. Although that is not the case here, it is seems best to use ';' unless ',' is intended. Found by inspection. No functional change intended. Compile tested only. Signed-off-by: Chen Ni Link: https://lore.kernel.org/r/20240905023521.1642862-1-nichen@iscas.ac.cn Signed-off-by: Martin K. Petersen --- drivers/scsi/pmcraid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index a2a084c8075e0..8f7307c4d8762 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -1946,7 +1946,7 @@ static void pmcraid_soft_reset(struct pmcraid_cmd *cmd) } iowrite32(doorbell, pinstance->int_regs.host_ioa_interrupt_reg); - ioread32(pinstance->int_regs.host_ioa_interrupt_reg), + ioread32(pinstance->int_regs.host_ioa_interrupt_reg); int_reg = ioread32(pinstance->int_regs.ioa_host_interrupt_reg); pmcraid_info("Waiting for IOA to become operational %x:%x\n", From bba20b894e3c2e20f1ac914561b9ac241e0e359e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 7 Sep 2024 08:27:22 +0200 Subject: [PATCH 158/352] scsi: scsi_debug: Remove a useless memset() 'arr' is kzalloc()'ed, so there is no need to call memset(.., 0, ...) on it. It is already cleared. This is a follow up of commit b952eb270df3 ("scsi: scsi_debug: Allocate the MODE SENSE response from the heap"). 
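(Editorial aside: a minimal sketch of the point being made, reusing the buffer and size names from scsi_debug.c; the GFP flag and the error path shown here are illustrative assumptions, not the driver's exact code. kzalloc() returns zero-filled memory, so the memset() removed by this patch was redundant.)

	unsigned char *arr;

	arr = kzalloc(SDEBUG_MAX_MSENSE_SZ, GFP_ATOMIC);	/* zero-filled on success */
	if (!arr)
		return -ENOMEM;					/* illustrative error handling */
	/* memset(arr, 0, SDEBUG_MAX_MSENSE_SZ); -- redundant, arr is already zeroed */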
Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/6296722174e39a51cac74b7fc68b0d75bd0db2a3.1725690433.git.christophe.jaillet@wanadoo.fr Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index a9d8a9c62663e..d95f417e24c0d 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -2760,7 +2760,6 @@ static int resp_mode_sense(struct scsi_cmnd *scp, else bd_len = 0; alloc_len = msense_6 ? cmd[4] : get_unaligned_be16(cmd + 7); - memset(arr, 0, SDEBUG_MAX_MSENSE_SZ); if (0x3 == pcontrol) { /* Saving values not supported */ mk_sense_buffer(scp, ILLEGAL_REQUEST, SAVING_PARAMS_UNSUP, 0); return check_condition_result; From a141c17a543332fc1238eb5cba562bfc66879126 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 12 Sep 2024 10:58:28 +0200 Subject: [PATCH 159/352] scsi: pm8001: Do not overwrite PCI queue mapping blk_mq_pci_map_queues() maps all queues but right after this, we overwrite these mappings by calling blk_mq_map_queues(). Just use one helper but not both. Fixes: 42f22fe36d51 ("scsi: pm8001: Expose hardware queues for pm80xx") Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20240912-do-not-overwrite-pci-mapping-v1-1-85724b6cec49@suse.de Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm8001_init.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 1e63cb6cd8e32..33e1eba62ca12 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -100,10 +100,12 @@ static void pm8001_map_queues(struct Scsi_Host *shost) struct pm8001_hba_info *pm8001_ha = sha->lldd_ha; struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; - if (pm8001_ha->number_of_intr > 1) + if (pm8001_ha->number_of_intr > 1) { blk_mq_pci_map_queues(qmap, pm8001_ha->pdev, 1); + return; + } - return blk_mq_map_queues(qmap); + blk_mq_map_queues(qmap); } /* From f81eaf08385ddd474a2f41595a7757502870c0eb Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Thu, 12 Sep 2024 15:43:08 +0200 Subject: [PATCH 160/352] scsi: sd: Fix off-by-one error in sd_read_block_characteristics() If the device returns page 0xb1 with length 8 (happens with qemu v2.x, for example), sd_read_block_characteristics() may attempt an out-of-bounds memory access when accessing the zoned field at offset 8. Fixes: 7fb019c46eee ("scsi: sd: Switch to using scsi_device VPD pages") Cc: stable@vger.kernel.org Signed-off-by: Martin Wilck Link: https://lore.kernel.org/r/20240912134308.282824-1-mwilck@suse.com Signed-off-by: Martin K. 
Petersen --- drivers/scsi/sd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 43a723164fc53..64aaf24c63981 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3402,7 +3402,7 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp, rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb1); - if (!vpd || vpd->len < 8) { + if (!vpd || vpd->len <= 8) { rcu_read_unlock(); return; } From 9634bb07083cfae38933d4e944709e06e4c30e74 Mon Sep 17 00:00:00 2001 From: Ranjan Kumar Date: Thu, 5 Sep 2024 15:57:49 +0530 Subject: [PATCH 161/352] scsi: mpi3mr: Enhance the Enable Controller retry logic When enabling the IOC request and polling for controller ready status, poll for controller fault and reset history bit. If the controller is faulted or the reset history bit is set, retry the initialization a maximum of three times (2 retries) or if the cumulative time taken for all retries exceeds 510 seconds. Signed-off-by: Prayas Patel Signed-off-by: Ranjan Kumar Link: https://lore.kernel.org/r/20240905102753.105310-2-ranjan.kumar@broadcom.com Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index 2e1a92d306b20..f589df62ead3f 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -1362,6 +1362,10 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc) int retval = 0; enum mpi3mr_iocstate ioc_state; u64 base_info; + u8 retry = 0; + u64 start_time, elapsed_time_sec; + +retry_bring_ioc_ready: ioc_status = readl(&mrioc->sysif_regs->ioc_status); ioc_config = readl(&mrioc->sysif_regs->ioc_configuration); @@ -1460,6 +1464,9 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc) ioc_config |= MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC; writel(ioc_config, &mrioc->sysif_regs->ioc_configuration); + if (retry == 0) + start_time = jiffies; + timeout = mrioc->ready_timeout * 10; do { ioc_state = mpi3mr_get_iocstate(mrioc); @@ -1469,6 +1476,12 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc) mpi3mr_iocstate_name(ioc_state)); return 0; } + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) || + (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) { + mpi3mr_print_fault_info(mrioc); + goto out_failed; + } if (!pci_device_is_present(mrioc->pdev)) { mrioc->unrecoverable = 1; ioc_err(mrioc, @@ -1477,9 +1490,19 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc) goto out_device_not_present; } msleep(100); - } while (--timeout); + elapsed_time_sec = jiffies_to_msecs(jiffies - start_time)/1000; + } while (elapsed_time_sec < mrioc->ready_timeout); out_failed: + elapsed_time_sec = jiffies_to_msecs(jiffies - start_time)/1000; + if ((retry < 2) && (elapsed_time_sec < (mrioc->ready_timeout - 60))) { + retry++; + + ioc_warn(mrioc, "retrying to bring IOC ready, retry_count:%d\n" + " elapsed time =%llu\n", retry, elapsed_time_sec); + + goto retry_bring_ioc_ready; + } ioc_state = mpi3mr_get_iocstate(mrioc); ioc_err(mrioc, "failed to bring to ready state, current state: %s\n", From fc1ddda330941b8a1571368bcbade16d377a5eaa Mon Sep 17 00:00:00 2001 From: Ranjan Kumar Date: Thu, 5 Sep 2024 15:57:50 +0530 Subject: [PATCH 162/352] scsi: mpi3mr: Use firmware-provided timestamp update interval Make driver use the timestamp update interval value provided by firmware in the driver 
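(Editorial aside: a condensed sketch of the fallback described above, using the names introduced by the patch below; configuration-read error handling is elided.)

	mrioc->ts_update_interval = MPI3MR_TSUPDATE_INTERVAL;	/* driver default */
	if (!mpi3mr_cfg_get_driver_pg1(mrioc, &driver_pg1, sizeof(driver_pg1)) &&
	    driver_pg1.time_stamp_update)
		mrioc->ts_update_interval = driver_pg1.time_stamp_update * 60;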
page 1. If firmware fails to provide non-zero value, then the driver will fall back to the driver defined macro. Signed-off-by: Prayas Patel Signed-off-by: Ranjan Kumar Link: https://lore.kernel.org/r/20240905102753.105310-3-ranjan.kumar@broadcom.com Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h | 1 + drivers/scsi/mpi3mr/mpi3mr.h | 4 +++- drivers/scsi/mpi3mr/mpi3mr_fw.c | 27 ++++++++++++++++++++++++++- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h index 4b7a8f6314a33..b46bd08eac990 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h @@ -1327,6 +1327,7 @@ struct mpi3_driver_page0 { struct mpi3_driver_page1 { struct mpi3_config_page_header header; __le32 flags; + u8 time_stamp_update; __le32 reserved0c; __le16 host_diag_trace_max_size; __le16 host_diag_trace_min_size; diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h index 1dc640de3efc9..4b5f43af9f608 100644 --- a/drivers/scsi/mpi3mr/mpi3mr.h +++ b/drivers/scsi/mpi3mr/mpi3mr.h @@ -1090,6 +1090,7 @@ struct scmd_priv { * @evtack_cmds_bitmap: Event Ack bitmap * @delayed_evtack_cmds_list: Delayed event acknowledgment list * @ts_update_counter: Timestamp update counter + * @ts_update_interval: Timestamp update interval * @reset_in_progress: Reset in progress flag * @unrecoverable: Controller unrecoverable flag * @prev_reset_result: Result of previous reset @@ -1277,7 +1278,8 @@ struct mpi3mr_ioc { unsigned long *evtack_cmds_bitmap; struct list_head delayed_evtack_cmds_list; - u32 ts_update_counter; + u16 ts_update_counter; + u16 ts_update_interval; u8 reset_in_progress; u8 unrecoverable; int prev_reset_result; diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index f589df62ead3f..ddb770fae9d34 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -2694,7 +2694,7 @@ static void mpi3mr_watchdog_work(struct work_struct *work) return; } - if (mrioc->ts_update_counter++ >= MPI3MR_TSUPDATE_INTERVAL) { + if (mrioc->ts_update_counter++ >= mrioc->ts_update_interval) { mrioc->ts_update_counter = 0; mpi3mr_sync_timestamp(mrioc); } @@ -3867,6 +3867,29 @@ static int mpi3mr_repost_diag_bufs(struct mpi3mr_ioc *mrioc) return retval; } +/** + * mpi3mr_read_tsu_interval - Update time stamp interval + * @mrioc: Adapter instance reference + * + * Update time stamp interval if its defined in driver page 1, + * otherwise use default value. 
+ * + * Return: Nothing + */ +static void +mpi3mr_read_tsu_interval(struct mpi3mr_ioc *mrioc) +{ + struct mpi3_driver_page1 driver_pg1; + u16 pg_sz = sizeof(driver_pg1); + int retval = 0; + + mrioc->ts_update_interval = MPI3MR_TSUPDATE_INTERVAL; + + retval = mpi3mr_cfg_get_driver_pg1(mrioc, &driver_pg1, pg_sz); + if (!retval && driver_pg1.time_stamp_update) + mrioc->ts_update_interval = (driver_pg1.time_stamp_update * 60); +} + /** * mpi3mr_print_ioc_info - Display controller information * @mrioc: Adapter instance reference @@ -4163,6 +4186,7 @@ int mpi3mr_init_ioc(struct mpi3mr_ioc *mrioc) goto out_failed_noretry; } + mpi3mr_read_tsu_interval(mrioc); mpi3mr_print_ioc_info(mrioc); if (!mrioc->cfg_page) { @@ -4344,6 +4368,7 @@ int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume) goto out_failed_noretry; } + mpi3mr_read_tsu_interval(mrioc); mpi3mr_print_ioc_info(mrioc); if (is_resume) { From 6e4c825f267ed151596fe2554cfa457b9aaa7edf Mon Sep 17 00:00:00 2001 From: Ranjan Kumar Date: Thu, 5 Sep 2024 15:57:51 +0530 Subject: [PATCH 163/352] scsi: mpi3mr: Update MPI Headers to revision 34 Update MPI Headers to revision 34. Signed-off-by: Prayas Patel Signed-off-by: Ranjan Kumar Link: https://lore.kernel.org/r/20240905102753.105310-4-ranjan.kumar@broadcom.com Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h | 34 +++++++++++++++++++++-- drivers/scsi/mpi3mr/mpi/mpi30_image.h | 13 +++++++-- drivers/scsi/mpi3mr/mpi/mpi30_ioc.h | 8 ++++++ drivers/scsi/mpi3mr/mpi/mpi30_transport.h | 4 ++- 4 files changed, 52 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h index b46bd08eac990..00cd18edfad6d 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h @@ -67,6 +67,7 @@ #define MPI3_SECURITY_PGAD_SLOT_GROUP_MASK (0x0000ff00) #define MPI3_SECURITY_PGAD_SLOT_GROUP_SHIFT (8) #define MPI3_SECURITY_PGAD_SLOT_MASK (0x000000ff) +#define MPI3_INSTANCE_PGAD_INSTANCE_MASK (0x0000ffff) struct mpi3_config_request { __le16 host_tag; u8 ioc_use_only02; @@ -75,7 +76,8 @@ struct mpi3_config_request { u8 ioc_use_only06; u8 msg_flags; __le16 change_count; - __le16 reserved0a; + u8 proxy_ioc_number; + u8 reserved0b; u8 page_version; u8 page_number; u8 page_type; @@ -206,6 +208,9 @@ struct mpi3_config_page_header { #define MPI3_MFGPAGE_DEVID_SAS5116_MPI_MGMT (0x00b5) #define MPI3_MFGPAGE_DEVID_SAS5116_NVME_MGMT (0x00b6) #define MPI3_MFGPAGE_DEVID_SAS5116_PCIE_SWITCH (0x00b8) +#define MPI3_MFGPAGE_DEVID_SAS5248_MPI (0x00f0) +#define MPI3_MFGPAGE_DEVID_SAS5248_MPI_NS (0x00f1) +#define MPI3_MFGPAGE_DEVID_SAS5248_PCIE_SWITCH (0x00f2) struct mpi3_man_page0 { struct mpi3_config_page_header header; u8 chip_revision[8]; @@ -1074,6 +1079,8 @@ struct mpi3_io_unit_page8 { #define MPI3_IOUNIT8_SBSTATE_SVN_UPDATE_PENDING (0x04) #define MPI3_IOUNIT8_SBSTATE_KEY_UPDATE_PENDING (0x02) #define MPI3_IOUNIT8_SBSTATE_SECURE_BOOT_ENABLED (0x01) +#define MPI3_IOUNIT8_SBMODE_CURRENT_KEY_IOUNIT17 (0x10) +#define MPI3_IOUNIT8_SBMODE_HARD_SECURE_RECERTIFIED (0x08) struct mpi3_io_unit_page9 { struct mpi3_config_page_header header; __le32 flags; @@ -1089,6 +1096,8 @@ struct mpi3_io_unit_page9 { #define MPI3_IOUNIT9_FLAGS_UBM_ENCLOSURE_ORDER_BACKPLANE_TYPE (0x00000004) #define MPI3_IOUNIT9_FLAGS_VDFIRST_ENABLED (0x00000001) #define MPI3_IOUNIT9_FIRSTDEVICE_UNKNOWN (0xffff) +#define MPI3_IOUNIT9_FIRSTDEVICE_IN_DRIVER_PAGE_0 (0xfffe) + struct mpi3_io_unit_page10 { struct mpi3_config_page_header header; u8 flags; @@ 
-1224,6 +1233,19 @@ struct mpi3_io_unit_page15 { #define MPI3_IOUNIT15_FLAGS_EPRSUPPORT_WITHOUT_POWER_BRAKE_GPIO (0x01) #define MPI3_IOUNIT15_FLAGS_EPRSUPPORT_WITH_POWER_BRAKE_GPIO (0x02) #define MPI3_IOUNIT15_NUMPOWERBUDGETDATA_POWER_BUDGETING_DISABLED (0x00) + +struct mpi3_io_unit_page17 { + struct mpi3_config_page_header header; + u8 num_instances; + u8 instance; + __le16 reserved0a; + __le32 reserved0c[4]; + __le16 key_length; + u8 encryption_algorithm; + u8 reserved1f; + __le32 current_key[]; +}; +#define MPI3_IOUNIT17_PAGEVERSION (0x00) struct mpi3_ioc_page0 { struct mpi3_config_page_header header; __le32 reserved08; @@ -1311,7 +1333,7 @@ struct mpi3_driver_page0 { u8 tur_interval; u8 reserved10; u8 security_key_timeout; - __le16 reserved12; + __le16 first_device; __le32 reserved14; __le32 reserved18; }; @@ -1324,11 +1346,13 @@ struct mpi3_driver_page0 { #define MPI3_DRIVER0_BSDOPTS_REGISTRATION_IOC_AND_DEVS (0x00000000) #define MPI3_DRIVER0_BSDOPTS_REGISTRATION_IOC_ONLY (0x00000001) #define MPI3_DRIVER0_BSDOPTS_REGISTRATION_IOC_AND_INTERNAL_DEVS (0x00000002) +#define MPI3_DRIVER0_FIRSTDEVICE_IGNORE1 (0x0000) +#define MPI3_DRIVER0_FIRSTDEVICE_IGNORE2 (0xffff) struct mpi3_driver_page1 { struct mpi3_config_page_header header; __le32 flags; u8 time_stamp_update; - __le32 reserved0c; + u8 reserved0d[3]; __le16 host_diag_trace_max_size; __le16 host_diag_trace_min_size; __le16 host_diag_trace_decrement_size; @@ -2348,6 +2372,10 @@ struct mpi3_device0_vd_format { #define MPI3_DEVICE0_VD_DEVICE_INFO_SAS (0x0001) #define MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_MASK (0xf000) #define MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_SHIFT (12) +#define MPI3_DEVICE0_VD_FLAGS_OSEXPOSURE_MASK (0x0003) +#define MPI3_DEVICE0_VD_FLAGS_OSEXPOSURE_HDD (0x0000) +#define MPI3_DEVICE0_VD_FLAGS_OSEXPOSURE_SSD (0x0001) +#define MPI3_DEVICE0_VD_FLAGS_OSEXPOSURE_NO_GUIDANCE (0x0002) union mpi3_device0_dev_spec_format { struct mpi3_device0_sas_sata_format sas_sata_format; struct mpi3_device0_pcie_format pcie_format; diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_image.h b/drivers/scsi/mpi3mr/mpi/mpi30_image.h index 7df2421901357..2c6e548cbd0ff 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_image.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_image.h @@ -205,13 +205,14 @@ struct mpi3_encrypted_hash_entry { u8 hash_image_type; u8 hash_algorithm; u8 encryption_algorithm; - u8 reserved03; + u8 flags; __le16 public_key_size; __le16 signature_size; __le32 public_key[MPI3_PUBLIC_KEY_MAX]; }; - -#define MPI3_HASH_IMAGE_TYPE_KEY_WITH_SIGNATURE (0x03) +#define MPI3_HASH_IMAGE_TYPE_KEY_WITH_HASH (0x03) +#define MPI3_HASH_IMAGE_TYPE_KEY_WITH_HASH_1_OF_2 (0x04) +#define MPI3_HASH_IMAGE_TYPE_KEY_WITH_HASH_2_OF_2 (0x05) #define MPI3_HASH_ALGORITHM_VERSION_MASK (0xe0) #define MPI3_HASH_ALGORITHM_VERSION_NONE (0x00) #define MPI3_HASH_ALGORITHM_VERSION_SHA1 (0x20) @@ -230,6 +231,12 @@ struct mpi3_encrypted_hash_entry { #define MPI3_ENCRYPTION_ALGORITHM_RSA4096 (0x05) #define MPI3_ENCRYPTION_ALGORITHM_RSA3072 (0x06) +/* hierarchical signature system (hss) */ +#define MPI3_ENCRYPTION_ALGORITHM_ML_DSA_87 (0x0b) +#define MPI3_ENCRYPTION_ALGORITHM_ML_DSA_65 (0x0c) +#define MPI3_ENCRYPTION_ALGORITHM_ML_DSA_44 (0x0d) +#define MPI3_ENCRYPTED_HASH_ENTRY_FLAGS_PAIRED_KEY_MASK (0x0f) + #ifndef MPI3_ENCRYPTED_HASH_ENTRY_MAX #define MPI3_ENCRYPTED_HASH_ENTRY_MAX (1) #endif diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h b/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h index c9fa0d69b75f1..c374867f9ba08 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h +++ 
b/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h @@ -39,6 +39,12 @@ struct mpi3_ioc_init_request { #define MPI3_WHOINIT_HOST_DRIVER (0x03) #define MPI3_WHOINIT_MANUFACTURER (0x04) +#define MPI3_IOCINIT_DRIVERCAP_OSEXPOSURE_MASK (0x00000003) +#define MPI3_IOCINIT_DRIVERCAP_OSEXPOSURE_NO_GUIDANCE (0x00000000) +#define MPI3_IOCINIT_DRIVERCAP_OSEXPOSURE_NO_SPECIAL (0x00000001) +#define MPI3_IOCINIT_DRIVERCAP_OSEXPOSURE_REPORT_AS_HDD (0x00000002) +#define MPI3_IOCINIT_DRIVERCAP_OSEXPOSURE_REPORT_AS_SSD (0x00000003) + struct mpi3_ioc_facts_request { __le16 host_tag; u8 ioc_use_only02; @@ -140,6 +146,8 @@ struct mpi3_ioc_facts_data { #define MPI3_IOCFACTS_EXCEPT_MANUFACT_CHECKSUM_FAIL (0x0020) #define MPI3_IOCFACTS_EXCEPT_FW_CHECKSUM_FAIL (0x0010) #define MPI3_IOCFACTS_EXCEPT_CONFIG_CHECKSUM_FAIL (0x0008) +#define MPI3_IOCFACTS_EXCEPT_BLOCKING_BOOT_EVENT (0x0004) +#define MPI3_IOCFACTS_EXCEPT_SECURITY_SELFTEST_FAILURE (0x0002) #define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_MASK (0x0001) #define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_PRIMARY (0x0000) #define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_SECONDARY (0x0001) diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_transport.h b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h index fdc3d1968e430..b2ab25a1cfebe 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_transport.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h @@ -18,7 +18,7 @@ union mpi3_version_union { #define MPI3_VERSION_MAJOR (3) #define MPI3_VERSION_MINOR (0) -#define MPI3_VERSION_UNIT (31) +#define MPI3_VERSION_UNIT (34) #define MPI3_VERSION_DEV (0) #define MPI3_DEVHANDLE_INVALID (0xffff) struct mpi3_sysif_oper_queue_indexes { @@ -158,6 +158,7 @@ struct mpi3_sysif_registers { #define MPI3_SYSIF_FAULT_CODE_SOFT_RESET_NEEDED (0x0000f004) #define MPI3_SYSIF_FAULT_CODE_POWER_CYCLE_REQUIRED (0x0000f005) #define MPI3_SYSIF_FAULT_CODE_TEMP_THRESHOLD_EXCEEDED (0x0000f006) +#define MPI3_SYSIF_FAULT_CODE_INSUFFICIENT_PCI_SLOT_POWER (0x0000f007) #define MPI3_SYSIF_FAULT_INFO0_OFFSET (0x00001c14) #define MPI3_SYSIF_FAULT_INFO1_OFFSET (0x00001c18) #define MPI3_SYSIF_FAULT_INFO2_OFFSET (0x00001c1c) @@ -410,6 +411,7 @@ struct mpi3_default_reply { #define MPI3_IOCSTATUS_INSUFFICIENT_RESOURCES (0x0006) #define MPI3_IOCSTATUS_INVALID_FIELD (0x0007) #define MPI3_IOCSTATUS_INVALID_STATE (0x0008) +#define MPI3_IOCSTATUS_SHUTDOWN_ACTIVE (0x0009) #define MPI3_IOCSTATUS_INSUFFICIENT_POWER (0x000a) #define MPI3_IOCSTATUS_INVALID_CHANGE_COUNT (0x000b) #define MPI3_IOCSTATUS_ALLOWED_CMD_BLOCK (0x000c) From 4616a4b3cb8aa736882f8dc392cf146365ead2a4 Mon Sep 17 00:00:00 2001 From: Ranjan Kumar Date: Thu, 5 Sep 2024 15:57:52 +0530 Subject: [PATCH 164/352] scsi: mpi3mr: Improve wait logic while controller transitions to READY state During controller transitioning to READY state, if the controller is found in transient states ("becoming ready" or "reset requested"), driver waits for 510 secs even if the controller transitions out of these states early. This causes an unnecessary delay of 510 secs in the overall firmware initialization sequence. Poll the controller state periodically (every 100 milliseconds) while waiting for the controller to come out of the mentioned transient states. Once the controller transits out of the transient states, come out of the wait loop. Signed-off-by: Sumit Saxena Signed-off-by: Ranjan Kumar Link: https://lore.kernel.org/r/20240905102753.105310-5-ranjan.kumar@broadcom.com Signed-off-by: Martin K. 
Petersen --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index ddb770fae9d34..86f7136cd5aee 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -1384,26 +1384,23 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc) ioc_info(mrioc, "controller is in %s state during detection\n", mpi3mr_iocstate_name(ioc_state)); - if (ioc_state == MRIOC_STATE_BECOMING_READY || - ioc_state == MRIOC_STATE_RESET_REQUESTED) { - timeout = mrioc->ready_timeout * 10; - do { - msleep(100); - } while (--timeout); + timeout = mrioc->ready_timeout * 10; + + do { + ioc_state = mpi3mr_get_iocstate(mrioc); + + if (ioc_state != MRIOC_STATE_BECOMING_READY && + ioc_state != MRIOC_STATE_RESET_REQUESTED) + break; if (!pci_device_is_present(mrioc->pdev)) { mrioc->unrecoverable = 1; - ioc_err(mrioc, - "controller is not present while waiting to reset\n"); - retval = -1; + ioc_err(mrioc, "controller is not present while waiting to reset\n"); goto out_device_not_present; } - ioc_state = mpi3mr_get_iocstate(mrioc); - ioc_info(mrioc, - "controller is in %s state after waiting to reset\n", - mpi3mr_iocstate_name(ioc_state)); - } + msleep(100); + } while (--timeout); if (ioc_state == MRIOC_STATE_READY) { ioc_info(mrioc, "issuing message unit reset (MUR) to bring to reset state\n"); From e7d67f3f9f9c89726c2ebcf5fce48db7798a49df Mon Sep 17 00:00:00 2001 From: Ranjan Kumar Date: Thu, 5 Sep 2024 15:57:53 +0530 Subject: [PATCH 165/352] scsi: mpi3mr: Update driver version to 8.12.0.0.50 Update driver version to 8.12.0.0.50. Signed-off-by: Ranjan Kumar Link: https://lore.kernel.org/r/20240905102753.105310-6-ranjan.kumar@broadcom.com Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h index 4b5f43af9f608..856c0928b13b9 100644 --- a/drivers/scsi/mpi3mr/mpi3mr.h +++ b/drivers/scsi/mpi3mr/mpi3mr.h @@ -57,8 +57,8 @@ extern struct list_head mrioc_list; extern int prot_mask; extern atomic64_t event_counter; -#define MPI3MR_DRIVER_VERSION "8.10.0.5.50" -#define MPI3MR_DRIVER_RELDATE "08-Aug-2024" +#define MPI3MR_DRIVER_VERSION "8.12.0.0.50" +#define MPI3MR_DRIVER_RELDATE "05-Sept-2024" #define MPI3MR_DRIVER_NAME "mpi3mr" #define MPI3MR_DRIVER_LICENSE "GPL" From 93bcc5f3984bf4f51da1529700aec351872dbfff Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Thu, 12 Sep 2024 16:24:40 -0700 Subject: [PATCH 166/352] scsi: lpfc: Add ELS_RSP cmd to the list of WQEs to flush in lpfc_els_flush_cmd() During HBA stress testing, a spam of received PLOGIs exposes a resource recovery bug causing leakage of lpfc_sqlq entries from the global phba->sli4_hba.lpfc_els_sgl_list. The issue is in lpfc_els_flush_cmd(), where the driver attempts to recover outstanding ELS sgls when walking the txcmplq. Only CMD_ELS_REQUEST64_CRs and CMD_GEN_REQUEST64_CRs are added to the abort and cancel lists. A check for CMD_XMIT_ELS_RSP64_WQE is missing in order to recover LS_ACC usages of the phba->sli4_hba.lpfc_els_sgl_list too. Fix by adding CMD_XMIT_ELS_RSP64_WQE as part of the txcmplq walk when adding WQEs to the abort and cancel list in lpfc_els_flush_cmd(). Also, update naming convention from CRs to WQEs. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240912232447.45607-2-justintee8345@gmail.com Signed-off-by: Martin K. 
Petersen --- drivers/scsi/lpfc/lpfc_els.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index de0ec945d2f1e..7219b1ada1ea3 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -9658,11 +9658,12 @@ lpfc_els_flush_cmd(struct lpfc_vport *vport) if (piocb->cmd_flag & LPFC_DRIVER_ABORTED && !mbx_tmo_err) continue; - /* On the ELS ring we can have ELS_REQUESTs or - * GEN_REQUESTs waiting for a response. + /* On the ELS ring we can have ELS_REQUESTs, ELS_RSPs, + * or GEN_REQUESTs waiting for a CQE response. */ ulp_command = get_job_cmnd(phba, piocb); - if (ulp_command == CMD_ELS_REQUEST64_CR) { + if (ulp_command == CMD_ELS_REQUEST64_WQE || + ulp_command == CMD_XMIT_ELS_RSP64_WQE) { list_add_tail(&piocb->dlist, &abort_list); /* If the link is down when flushing ELS commands From fc318cac66ac50398f9fc7cbec7b339e6d08a7e6 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Thu, 12 Sep 2024 16:24:41 -0700 Subject: [PATCH 167/352] scsi: lpfc: Update phba link state conditional before sending CMF_SYNC_WQE It's possible for the driver to send a CMF_SYNC_WQE to nonresponsive firmware during reset of the adapter. The phba link_state conditional check is currently a strict == LPFC_LINK_DOWN, which does not cover initialization states before reaching the LPFC_LINK_UP state. Update the phba->link_state conditional to < LPFC_LINK_UP so that all initialization states are covered before allowing sending CMF_SYNC_WQE. Update taking of the hbalock to be during this link_state check as well. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240912232447.45607-3-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_sli.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 332b8d2348e9e..bb5fd3322273e 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -1940,12 +1940,15 @@ lpfc_issue_cmf_sync_wqe(struct lpfc_hba *phba, u32 ms, u64 total) atot = atomic_xchg(&phba->cgn_sync_alarm_cnt, 0); wtot = atomic_xchg(&phba->cgn_sync_warn_cnt, 0); + spin_lock_irqsave(&phba->hbalock, iflags); + /* ONLY Managed mode will send the CMF_SYNC_WQE to the HBA */ if (phba->cmf_active_mode != LPFC_CFG_MANAGED || - phba->link_state == LPFC_LINK_DOWN) - return 0; + phba->link_state < LPFC_LINK_UP) { + ret_val = 0; + goto out_unlock; + } - spin_lock_irqsave(&phba->hbalock, iflags); sync_buf = __lpfc_sli_get_iocbq(phba); if (!sync_buf) { lpfc_printf_log(phba, KERN_ERR, LOG_CGN_MGMT, From 05ab4e7846f1103377133c00295a9a910cc6dfc2 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Thu, 12 Sep 2024 16:24:42 -0700 Subject: [PATCH 168/352] scsi: lpfc: Restrict support for 32 byte CDBs to specific HBAs An older generation of HBAs are failing FCP discovery due to usage of an outdated field in FCP command WQEs. Fix by checking the SLI Interface Type register for applicable support of 32 Byte CDB commands, and restore a setting for a WQE path using normal 16 byte CDBs. Fixes: af20bb73ac25 ("scsi: lpfc: Add support for 32 byte CDBs") Cc: stable@vger.kernel.org # v6.10+ Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240912232447.45607-4-justintee8345@gmail.com Signed-off-by: Martin K. 
Petersen --- drivers/scsi/lpfc/lpfc_hw4.h | 3 +++ drivers/scsi/lpfc/lpfc_init.c | 21 ++++++++++++++++++--- drivers/scsi/lpfc/lpfc_scsi.c | 2 +- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index 500253007b1dc..26e1313ebb21f 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -4847,6 +4847,7 @@ struct fcp_iwrite64_wqe { #define cmd_buff_len_SHIFT 16 #define cmd_buff_len_MASK 0x00000ffff #define cmd_buff_len_WORD word3 +/* Note: payload_offset_len field depends on ASIC support */ #define payload_offset_len_SHIFT 0 #define payload_offset_len_MASK 0x0000ffff #define payload_offset_len_WORD word3 @@ -4863,6 +4864,7 @@ struct fcp_iread64_wqe { #define cmd_buff_len_SHIFT 16 #define cmd_buff_len_MASK 0x00000ffff #define cmd_buff_len_WORD word3 +/* Note: payload_offset_len field depends on ASIC support */ #define payload_offset_len_SHIFT 0 #define payload_offset_len_MASK 0x0000ffff #define payload_offset_len_WORD word3 @@ -4879,6 +4881,7 @@ struct fcp_icmnd64_wqe { #define cmd_buff_len_SHIFT 16 #define cmd_buff_len_MASK 0x00000ffff #define cmd_buff_len_WORD word3 +/* Note: payload_offset_len field depends on ASIC support */ #define payload_offset_len_SHIFT 0 #define payload_offset_len_MASK 0x0000ffff #define payload_offset_len_WORD word3 diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index e1dfa96c2a553..0c1404dc5f3bd 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -4699,6 +4699,7 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev) uint64_t wwn; bool use_no_reset_hba = false; int rc; + u8 if_type; if (lpfc_no_hba_reset_cnt) { if (phba->sli_rev < LPFC_SLI_REV4 && @@ -4773,10 +4774,24 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev) shost->max_id = LPFC_MAX_TARGET; shost->max_lun = vport->cfg_max_luns; shost->this_id = -1; - if (phba->sli_rev == LPFC_SLI_REV4) - shost->max_cmd_len = LPFC_FCP_CDB_LEN_32; - else + + /* Set max_cmd_len applicable to ASIC support */ + if (phba->sli_rev == LPFC_SLI_REV4) { + if_type = bf_get(lpfc_sli_intf_if_type, + &phba->sli4_hba.sli_intf); + switch (if_type) { + case LPFC_SLI_INTF_IF_TYPE_2: + fallthrough; + case LPFC_SLI_INTF_IF_TYPE_6: + shost->max_cmd_len = LPFC_FCP_CDB_LEN_32; + break; + default: + shost->max_cmd_len = LPFC_FCP_CDB_LEN; + break; + } + } else { shost->max_cmd_len = LPFC_FCP_CDB_LEN; + } if (phba->sli_rev == LPFC_SLI_REV4) { if (!phba->cfg_fcp_mq_threshold || diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 60cd60ebff38e..0eaede8275dac 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -4760,7 +4760,7 @@ static int lpfc_scsi_prep_cmnd_buf_s4(struct lpfc_vport *vport, /* Word 3 */ bf_set(payload_offset_len, &wqe->fcp_icmd, - sizeof(struct fcp_cmnd32) + sizeof(struct fcp_rsp)); + sizeof(struct fcp_cmnd) + sizeof(struct fcp_rsp)); /* Word 6 */ bf_set(wqe_ctxt_tag, &wqe->generic.wqe_com, From d1a2ef63fc8b3e3dc5b74b7e08636896b32acbdc Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Thu, 12 Sep 2024 16:24:43 -0700 Subject: [PATCH 169/352] scsi: lpfc: Fix kref imbalance on fabric ndlps from dev_loss_tmo handler With a FLOGI outstanding and loss of physical link connection to the fabric for the duration of dev_loss_tmo, there is a fabric ndlp kref imbalance that decrements the kref and sets the NLP_IN_RECOV_POST_DEV_LOSS flag at the same time. 
The issue is that when the FLOGI completion routine executes, the fabric ndlp could already be freed because of the final kref put from the dev_loss_tmo handler. Fix by early returning before the ndlp kref put if the ndlp is deemed a candidate for NLP_IN_RECOV_POST_DEV_LOSS in the FLOGI completion routine. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240912232447.45607-5-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_hbadisc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 35c9181c6608a..9241075f72fa4 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -527,6 +527,9 @@ lpfc_dev_loss_tmo_handler(struct lpfc_nodelist *ndlp) * the following lpfc_nlp_put is necessary after fabric node is * recovered. */ + spin_lock_irqsave(&ndlp->lock, iflags); + ndlp->nlp_flag &= ~NLP_IN_DEV_LOSS; + spin_unlock_irqrestore(&ndlp->lock, iflags); if (recovering) { lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY | LOG_NODE, @@ -539,6 +542,7 @@ lpfc_dev_loss_tmo_handler(struct lpfc_nodelist *ndlp) spin_lock_irqsave(&ndlp->lock, iflags); ndlp->save_flags |= NLP_IN_RECOV_POST_DEV_LOSS; spin_unlock_irqrestore(&ndlp->lock, iflags); + return fcf_inuse; } else if (ndlp->nlp_state == NLP_STE_UNMAPPED_NODE) { /* Fabric node fully recovered before this dev_loss_tmo * queue work is processed. Thus, ignore the @@ -552,15 +556,9 @@ lpfc_dev_loss_tmo_handler(struct lpfc_nodelist *ndlp) ndlp->nlp_DID, kref_read(&ndlp->kref), ndlp, ndlp->nlp_flag, vport->port_state); - spin_lock_irqsave(&ndlp->lock, iflags); - ndlp->nlp_flag &= ~NLP_IN_DEV_LOSS; - spin_unlock_irqrestore(&ndlp->lock, iflags); return fcf_inuse; } - spin_lock_irqsave(&ndlp->lock, iflags); - ndlp->nlp_flag &= ~NLP_IN_DEV_LOSS; - spin_unlock_irqrestore(&ndlp->lock, iflags); lpfc_nlp_put(ndlp); return fcf_inuse; } From 0a3c84f71680684c1d41abb92db05f95c09111e8 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Thu, 12 Sep 2024 16:24:44 -0700 Subject: [PATCH 170/352] scsi: lpfc: Ensure DA_ID handling completion before deleting an NPIV instance Deleting an NPIV instance requires all fabric ndlps to be released before an NPIV's resources can be torn down. Failure to release fabric ndlps beforehand opens kref imbalance race conditions. Fix by forcing the DA_ID to complete synchronously with usage of wait_queue. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240912232447.45607-6-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_ct.c | 12 ++++++++++ drivers/scsi/lpfc/lpfc_disc.h | 7 ++++++ drivers/scsi/lpfc/lpfc_vport.c | 43 ++++++++++++++++++++++++++++------ 3 files changed, 55 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index 2dedd1493e5ba..1e5db489a00c5 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -1647,6 +1647,18 @@ lpfc_cmpl_ct(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, } out: + /* If the caller wanted a synchronous DA_ID completion, signal the + * wait obj and clear flag to reset the vport. 
+ */ + if (ndlp->save_flags & NLP_WAIT_FOR_DA_ID) { + if (ndlp->da_id_waitq) + wake_up(ndlp->da_id_waitq); + } + + spin_lock_irq(&ndlp->lock); + ndlp->save_flags &= ~NLP_WAIT_FOR_DA_ID; + spin_unlock_irq(&ndlp->lock); + lpfc_ct_free_iocb(phba, cmdiocb); lpfc_nlp_put(ndlp); return; diff --git a/drivers/scsi/lpfc/lpfc_disc.h b/drivers/scsi/lpfc/lpfc_disc.h index f82615d87c4bb..f5ae8cc158205 100644 --- a/drivers/scsi/lpfc/lpfc_disc.h +++ b/drivers/scsi/lpfc/lpfc_disc.h @@ -90,6 +90,8 @@ enum lpfc_nlp_save_flags { NLP_IN_RECOV_POST_DEV_LOSS = 0x1, /* wait for outstanding LOGO to cmpl */ NLP_WAIT_FOR_LOGO = 0x2, + /* wait for outstanding DA_ID to finish */ + NLP_WAIT_FOR_DA_ID = 0x4 }; struct lpfc_nodelist { @@ -159,7 +161,12 @@ struct lpfc_nodelist { uint32_t nvme_fb_size; /* NVME target's supported byte cnt */ #define NVME_FB_BIT_SHIFT 9 /* PRLI Rsp first burst in 512B units. */ uint32_t nlp_defer_did; + + /* These wait objects are NPIV specific. These IOs must complete + * synchronously. + */ wait_queue_head_t *logo_waitq; + wait_queue_head_t *da_id_waitq; }; struct lpfc_node_rrq { diff --git a/drivers/scsi/lpfc/lpfc_vport.c b/drivers/scsi/lpfc/lpfc_vport.c index 4439167a51882..7a4d4d8e2ad55 100644 --- a/drivers/scsi/lpfc/lpfc_vport.c +++ b/drivers/scsi/lpfc/lpfc_vport.c @@ -626,6 +626,7 @@ lpfc_vport_delete(struct fc_vport *fc_vport) struct Scsi_Host *shost = lpfc_shost_from_vport(vport); struct lpfc_hba *phba = vport->phba; int rc; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(waitq); if (vport->port_type == LPFC_PHYSICAL_PORT) { lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, @@ -679,21 +680,49 @@ lpfc_vport_delete(struct fc_vport *fc_vport) if (!ndlp) goto skip_logo; + /* Send the DA_ID and Fabric LOGO to cleanup the NPIV fabric entries. */ if (ndlp && ndlp->nlp_state == NLP_STE_UNMAPPED_NODE && phba->link_state >= LPFC_LINK_UP && phba->fc_topology != LPFC_TOPOLOGY_LOOP) { if (vport->cfg_enable_da_id) { - /* Send DA_ID and wait for a completion. */ + /* Send DA_ID and wait for a completion. This is best + * effort. If the DA_ID fails, likely the fabric will + * "leak" NportIDs but at least the driver issued the + * command. + */ + ndlp = lpfc_findnode_did(vport, NameServer_DID); + if (!ndlp) + goto issue_logo; + + spin_lock_irq(&ndlp->lock); + ndlp->da_id_waitq = &waitq; + ndlp->save_flags |= NLP_WAIT_FOR_DA_ID; + spin_unlock_irq(&ndlp->lock); + rc = lpfc_ns_cmd(vport, SLI_CTNS_DA_ID, 0, 0); - if (rc) { - lpfc_printf_log(vport->phba, KERN_WARNING, - LOG_VPORT, - "1829 CT command failed to " - "delete objects on fabric, " - "rc %d\n", rc); + if (!rc) { + wait_event_timeout(waitq, + !(ndlp->save_flags & NLP_WAIT_FOR_DA_ID), + msecs_to_jiffies(phba->fc_ratov * 2000)); } + + lpfc_printf_vlog(vport, KERN_INFO, LOG_VPORT | LOG_ELS, + "1829 DA_ID issue status %d. " + "SFlag x%x NState x%x, NFlag x%x " + "Rpi x%x\n", + rc, ndlp->save_flags, ndlp->nlp_state, + ndlp->nlp_flag, ndlp->nlp_rpi); + + /* Remove the waitq and save_flags. It no + * longer matters if the wake happened. + */ + spin_lock_irq(&ndlp->lock); + ndlp->da_id_waitq = NULL; + ndlp->save_flags &= ~NLP_WAIT_FOR_DA_ID; + spin_unlock_irq(&ndlp->lock); } +issue_logo: /* * If the vpi is not registered, then a valid FDISC doesn't * exist and there is no need for a ELS LOGO. 
Just cleanup From 1af9af1f8ab38f1285b27581a5e6920ec58296ba Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Thu, 12 Sep 2024 16:24:45 -0700 Subject: [PATCH 171/352] scsi: lpfc: Revise TRACE_EVENT log flag severities from KERN_ERR to KERN_WARNING Revise certain log messages marked as KERN_ERR LOG_TRACE_EVENT to KERN_WARNING and use the lpfc_vlog_msg() macro to still log the event. The benefit is that events of interest are still logged and the entire trace buffer is not dumped with extraneous logging information when using default lpfc_log_verbose driver parameter settings. Also, delete the keyword "fail" from such log messages as they aren't really causes for concern. The log messages are more for warnings to a SAN admin about SAN activity. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240912232447.45607-7-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_ct.c | 10 +-- drivers/scsi/lpfc/lpfc_els.c | 125 +++++++++++++++++------------------ 2 files changed, 64 insertions(+), 71 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index 1e5db489a00c5..134bc96dd1340 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -1572,8 +1572,8 @@ lpfc_cmpl_ct_cmd_gft_id(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, } } } else - lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, - "3065 GFT_ID failed x%08x\n", ulp_status); + lpfc_vlog_msg(vport, KERN_WARNING, LOG_DISCOVERY, + "3065 GFT_ID status x%08x\n", ulp_status); out: lpfc_ct_free_iocb(phba, cmdiocb); @@ -2258,7 +2258,7 @@ lpfc_cmpl_ct_disc_fdmi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, } lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY, - "0229 FDMI cmd %04x failed, latt = %d " + "0229 FDMI cmd %04x latt = %d " "ulp_status: x%x, rid x%x\n", be16_to_cpu(fdmi_cmd), latt, ulp_status, ulp_word4); @@ -2275,9 +2275,9 @@ lpfc_cmpl_ct_disc_fdmi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, /* Check for a CT LS_RJT response */ cmd = be16_to_cpu(fdmi_cmd); if (be16_to_cpu(fdmi_rsp) == SLI_CT_RESPONSE_FS_RJT) { - /* FDMI rsp failed */ + /* Log FDMI reject */ lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY | LOG_ELS, - "0220 FDMI cmd failed FS_RJT Data: x%x", cmd); + "0220 FDMI cmd FS_RJT Data: x%x", cmd); /* Should we fallback to FDMI-2 / FDMI-1 ? 
*/ switch (cmd) { diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 7219b1ada1ea3..d737b897ddd82 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -979,7 +979,7 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, phba->fcoe_cvl_eventtag_attn = phba->fcoe_cvl_eventtag; lpfc_printf_log(phba, KERN_WARNING, LOG_FIP | LOG_ELS, - "2611 FLOGI failed on FCF (x%x), " + "2611 FLOGI FCF (x%x), " "status:x%x/x%x, tmo:x%x, perform " "roundrobin FCF failover\n", phba->fcf.current_rec.fcf_indx, @@ -997,11 +997,11 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, if (!(ulp_status == IOSTAT_LOCAL_REJECT && ((ulp_word4 & IOERR_PARAM_MASK) == IOERR_LOOP_OPEN_FAILURE))) - lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, - "2858 FLOGI failure Status:x%x/x%x TMO" - ":x%x Data x%lx x%x\n", - ulp_status, ulp_word4, tmo, - phba->hba_flag, phba->fcf.fcf_flag); + lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, + "2858 FLOGI Status:x%x/x%x TMO" + ":x%x Data x%lx x%x\n", + ulp_status, ulp_word4, tmo, + phba->hba_flag, phba->fcf.fcf_flag); /* Check for retry */ if (lpfc_els_retry(phba, cmdiocb, rspiocb)) { @@ -1023,7 +1023,7 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, lpfc_nlp_put(ndlp); lpfc_printf_vlog(vport, KERN_WARNING, LOG_ELS, - "0150 FLOGI failure Status:x%x/x%x " + "0150 FLOGI Status:x%x/x%x " "xri x%x TMO:x%x refcnt %d\n", ulp_status, ulp_word4, cmdiocb->sli4_xritag, tmo, kref_read(&ndlp->kref)); @@ -1032,11 +1032,11 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, if (!(ulp_status == IOSTAT_LOCAL_REJECT && ((ulp_word4 & IOERR_PARAM_MASK) == IOERR_LOOP_OPEN_FAILURE))) { - /* FLOGI failure */ - lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, - "0100 FLOGI failure Status:x%x/x%x " - "TMO:x%x\n", - ulp_status, ulp_word4, tmo); + /* Warn FLOGI status */ + lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, + "0100 FLOGI Status:x%x/x%x " + "TMO:x%x\n", + ulp_status, ulp_word4, tmo); goto flogifail; } @@ -1964,16 +1964,16 @@ lpfc_cmpl_els_rrq(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, if (ulp_status) { /* Check for retry */ - /* RRQ failed Don't print the vport to vport rjts */ + /* Warn RRQ status Don't print the vport to vport rjts */ if (ulp_status != IOSTAT_LS_RJT || (((ulp_word4) >> 16 != LSRJT_INVALID_CMD) && ((ulp_word4) >> 16 != LSRJT_UNABLE_TPC)) || (phba)->pport->cfg_log_verbose & LOG_ELS) - lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, - "2881 RRQ failure DID:%06X Status:" - "x%x/x%x\n", - ndlp->nlp_DID, ulp_status, - ulp_word4); + lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, + "2881 RRQ DID:%06X Status:" + "x%x/x%x\n", + ndlp->nlp_DID, ulp_status, + ulp_word4); } lpfc_clr_rrq_active(phba, rrq->xritag, rrq); @@ -2077,16 +2077,16 @@ lpfc_cmpl_els_plogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, } goto out; } - /* PLOGI failed Don't print the vport to vport rjts */ + /* Warn PLOGI status Don't print the vport to vport rjts */ if (ulp_status != IOSTAT_LS_RJT || (((ulp_word4) >> 16 != LSRJT_INVALID_CMD) && ((ulp_word4) >> 16 != LSRJT_UNABLE_TPC)) || (phba)->pport->cfg_log_verbose & LOG_ELS) - lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, - "2753 PLOGI failure DID:%06X " - "Status:x%x/x%x\n", - ndlp->nlp_DID, ulp_status, - ulp_word4); + lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, + "2753 PLOGI DID:%06X " + "Status:x%x/x%x\n", + ndlp->nlp_DID, ulp_status, + ulp_word4); /* Do not call DSM for lpfc_els_abort'ed ELS cmds */ if 
(!lpfc_error_lost_link(vport, ulp_status, ulp_word4)) @@ -2323,7 +2323,6 @@ lpfc_cmpl_els_prli(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, struct lpfc_vport *vport = cmdiocb->vport; struct lpfc_nodelist *ndlp; char *mode; - u32 loglevel; u32 ulp_status; u32 ulp_word4; bool release_node = false; @@ -2372,17 +2371,14 @@ lpfc_cmpl_els_prli(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, * could be expected. */ if (test_bit(FC_FABRIC, &vport->fc_flag) || - vport->cfg_enable_fc4_type != LPFC_ENABLE_BOTH) { - mode = KERN_ERR; - loglevel = LOG_TRACE_EVENT; - } else { + vport->cfg_enable_fc4_type != LPFC_ENABLE_BOTH) + mode = KERN_WARNING; + else mode = KERN_INFO; - loglevel = LOG_ELS; - } - /* PRLI failed */ - lpfc_printf_vlog(vport, mode, loglevel, - "2754 PRLI failure DID:%06X Status:x%x/x%x, " + /* Warn PRLI status */ + lpfc_printf_vlog(vport, mode, LOG_ELS, + "2754 PRLI DID:%06X Status:x%x/x%x, " "data: x%x x%x x%x\n", ndlp->nlp_DID, ulp_status, ulp_word4, ndlp->nlp_state, @@ -2854,11 +2850,11 @@ lpfc_cmpl_els_adisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, } goto out; } - /* ADISC failed */ - lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, - "2755 ADISC failure DID:%06X Status:x%x/x%x\n", - ndlp->nlp_DID, ulp_status, - ulp_word4); + /* Warn ADISC status */ + lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, + "2755 ADISC DID:%06X Status:x%x/x%x\n", + ndlp->nlp_DID, ulp_status, + ulp_word4); lpfc_disc_state_machine(vport, ndlp, cmdiocb, NLP_EVT_CMPL_ADISC); @@ -3045,12 +3041,12 @@ lpfc_cmpl_els_logo(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, * discovery. The PLOGI will retry. */ if (ulp_status) { - /* LOGO failed */ - lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, - "2756 LOGO failure, No Retry DID:%06X " - "Status:x%x/x%x\n", - ndlp->nlp_DID, ulp_status, - ulp_word4); + /* Warn LOGO status */ + lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, + "2756 LOGO, No Retry DID:%06X " + "Status:x%x/x%x\n", + ndlp->nlp_DID, ulp_status, + ulp_word4); if (lpfc_error_lost_link(vport, ulp_status, ulp_word4)) skip_recovery = 1; @@ -4837,11 +4833,10 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, if ((phba->sli3_options & LPFC_SLI3_NPIV_ENABLED) && (cmd == ELS_CMD_FDISC) && (stat.un.b.lsRjtRsnCodeExp == LSEXP_OUT_OF_RESOURCE)){ - lpfc_printf_vlog(vport, KERN_ERR, - LOG_TRACE_EVENT, - "0125 FDISC Failed (x%x). " - "Fabric out of resources\n", - stat.un.lsRjtError); + lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, + "0125 FDISC (x%x). " + "Fabric out of resources\n", + stat.un.lsRjtError); lpfc_vport_set_state(vport, FC_VPORT_NO_FABRIC_RSCS); } @@ -4877,11 +4872,10 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, LSEXP_NOTHING_MORE) { vport->fc_sparam.cmn.bbRcvSizeMsb &= 0xf; retry = 1; - lpfc_printf_vlog(vport, KERN_ERR, - LOG_TRACE_EVENT, - "0820 FLOGI Failed (x%x). " - "BBCredit Not Supported\n", - stat.un.lsRjtError); + lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, + "0820 FLOGI (x%x). " + "BBCredit Not Supported\n", + stat.un.lsRjtError); } break; @@ -4891,11 +4885,10 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, ((stat.un.b.lsRjtRsnCodeExp == LSEXP_INVALID_PNAME) || (stat.un.b.lsRjtRsnCodeExp == LSEXP_INVALID_NPORT_ID)) ) { - lpfc_printf_vlog(vport, KERN_ERR, - LOG_TRACE_EVENT, - "0122 FDISC Failed (x%x). " - "Fabric Detected Bad WWN\n", - stat.un.lsRjtError); + lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, + "0122 FDISC (x%x). 
" + "Fabric Detected Bad WWN\n", + stat.un.lsRjtError); lpfc_vport_set_state(vport, FC_VPORT_FABRIC_REJ_WWN); } @@ -5355,8 +5348,8 @@ lpfc_cmpl_els_rsp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, u32 ulp_status, ulp_word4, tmo, did, iotag; if (!vport) { - lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT, - "3177 ELS response failed\n"); + lpfc_printf_log(phba, KERN_WARNING, LOG_ELS, + "3177 null vport in ELS rsp\n"); goto out; } if (cmdiocb->context_un.mbox) @@ -11328,10 +11321,10 @@ lpfc_cmpl_els_fdisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, /* Check for retry */ if (lpfc_els_retry(phba, cmdiocb, rspiocb)) goto out; - /* FDISC failed */ - lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, - "0126 FDISC failed. (x%x/x%x)\n", - ulp_status, ulp_word4); + /* Warn FDISC status */ + lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, + "0126 FDISC cmpl status: x%x/x%x)\n", + ulp_status, ulp_word4); goto fdisc_failed; } From eeb85c658e1bceaccb6ca3ffc1796741abd7b687 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Thu, 12 Sep 2024 16:24:46 -0700 Subject: [PATCH 172/352] scsi: lpfc: Support loopback tests with VMID enabled The VMID feature adds an extra application services header to each frame. As such, the loopback test path is updated to accommodate the extra application header. Changes include filling in APPID and WQES bit fields for XMIT_SEQUENCE64 commands, a special loopback source APPID for verifying received loopback data matches what is sent, and increasing ELS WQ size to accommodate the APPID field in loopback test mode. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240912232447.45607-8-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_bsg.c | 3 +++ drivers/scsi/lpfc/lpfc_hw.h | 21 +++++++++++++++++++ drivers/scsi/lpfc/lpfc_init.c | 11 ++++++++-- drivers/scsi/lpfc/lpfc_sli.c | 39 +++++++++++++++++++++++++++++++++-- 4 files changed, 70 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c index 4156419c52c78..8738c1b0e3880 100644 --- a/drivers/scsi/lpfc/lpfc_bsg.c +++ b/drivers/scsi/lpfc/lpfc_bsg.c @@ -3208,6 +3208,9 @@ lpfc_bsg_diag_loopback_run(struct bsg_job *job) cmdiocbq->num_bdes = num_bde; cmdiocbq->cmd_flag |= LPFC_IO_LIBDFC; cmdiocbq->cmd_flag |= LPFC_IO_LOOPBACK; + if (phba->cfg_vmid_app_header) + cmdiocbq->cmd_flag |= LPFC_IO_VMID; + cmdiocbq->vport = phba->pport; cmdiocbq->cmd_cmpl = NULL; cmdiocbq->bpl_dmabuf = txbmp; diff --git a/drivers/scsi/lpfc/lpfc_hw.h b/drivers/scsi/lpfc/lpfc_hw.h index 2108b4cb78157..d5c15742f7f29 100644 --- a/drivers/scsi/lpfc/lpfc_hw.h +++ b/drivers/scsi/lpfc/lpfc_hw.h @@ -561,6 +561,27 @@ struct fc_vft_header { #include +/* + * Application Header + */ +struct fc_app_header { + uint32_t dst_app_id; + uint32_t src_app_id; +#define LOOPBACK_SRC_APPID 0x4321 + uint32_t word2; + uint32_t word3; +}; + +/* + * dfctl optional header definition + */ +enum lpfc_fc_dfctl { + LPFC_FC_NO_DEVICE_HEADER, + LPFC_FC_16B_DEVICE_HEADER, + LPFC_FC_32B_DEVICE_HEADER, + LPFC_FC_64B_DEVICE_HEADER, +}; + /* * Extended Link Service LS_COMMAND codes (Payload Word 0) */ diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 0c1404dc5f3bd..48a3dfdd51d39 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -10451,6 +10451,7 @@ lpfc_sli4_queue_create(struct lpfc_hba *phba) struct lpfc_vector_map_info *cpup; struct lpfc_vector_map_info *eqcpup; struct lpfc_eq_intr_info *eqi; + u32 wqesize; /* * Create HBA Record arrays. 
@@ -10670,9 +10671,15 @@ lpfc_sli4_queue_create(struct lpfc_hba *phba) * Create ELS Work Queues */ - /* Create slow-path ELS Work Queue */ + /* + * Create slow-path ELS Work Queue. + * Increase the ELS WQ size when WQEs contain an embedded cdb + */ + wqesize = (phba->fcp_embed_io) ? + LPFC_WQE128_SIZE : phba->sli4_hba.wq_esize; + qdesc = lpfc_sli4_queue_alloc(phba, LPFC_DEFAULT_PAGE_SIZE, - phba->sli4_hba.wq_esize, + wqesize, phba->sli4_hba.wq_ecount, cpu); if (!qdesc) { lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT, diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index bb5fd3322273e..a44afb84cd3d7 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -11093,9 +11093,17 @@ __lpfc_sli_prep_xmit_seq64_s4(struct lpfc_iocbq *cmdiocbq, /* Word 9 */ bf_set(wqe_rcvoxid, &wqe->xmit_sequence.wqe_com, ox_id); - /* Word 12 */ - if (cmdiocbq->cmd_flag & (LPFC_IO_LIBDFC | LPFC_IO_LOOPBACK)) + if (cmdiocbq->cmd_flag & (LPFC_IO_LIBDFC | LPFC_IO_LOOPBACK)) { + /* Word 10 */ + if (cmdiocbq->cmd_flag & LPFC_IO_VMID) { + bf_set(wqe_appid, &wqe->xmit_sequence.wqe_com, 1); + bf_set(wqe_wqes, &wqe->xmit_sequence.wqe_com, 1); + wqe->words[31] = LOOPBACK_SRC_APPID; + } + + /* Word 12 */ wqe->xmit_sequence.xmit_len = full_size; + } else wqe->xmit_sequence.xmit_len = wqe->xmit_sequence.bde.tus.f.bdeSize; @@ -18434,6 +18442,7 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr) { /* make rctl_names static to save stack space */ struct fc_vft_header *fc_vft_hdr; + struct fc_app_header *fc_app_hdr; uint32_t *header = (uint32_t *) fc_hdr; #define FC_RCTL_MDS_DIAGS 0xF4 @@ -18489,6 +18498,32 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr) goto drop; } + if (unlikely(phba->link_flag == LS_LOOPBACK_MODE && + phba->cfg_vmid_app_header)) { + /* Application header is 16B device header */ + if (fc_hdr->fh_df_ctl & LPFC_FC_16B_DEVICE_HEADER) { + fc_app_hdr = (struct fc_app_header *) (fc_hdr + 1); + if (be32_to_cpu(fc_app_hdr->src_app_id) != + LOOPBACK_SRC_APPID) { + lpfc_printf_log(phba, KERN_WARNING, + LOG_ELS | LOG_LIBDFC, + "1932 Loopback src app id " + "not matched, app_id:x%x\n", + be32_to_cpu(fc_app_hdr->src_app_id)); + + goto drop; + } + } else { + lpfc_printf_log(phba, KERN_WARNING, + LOG_ELS | LOG_LIBDFC, + "1933 Loopback df_ctl bit not set, " + "df_ctl:x%x\n", + fc_hdr->fh_df_ctl); + + goto drop; + } + } + lpfc_printf_log(phba, KERN_INFO, LOG_ELS, "2538 Received frame rctl:x%x, type:x%x, " "frame Data:%08x %08x %08x %08x %08x %08x %08x\n", From b071c1a9099c7bc5fe24089117e7a15e52d4198f Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Thu, 12 Sep 2024 16:24:47 -0700 Subject: [PATCH 173/352] scsi: lpfc: Update lpfc version to 14.4.0.5 Update lpfc version to 14.4.0.5 Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240912232447.45607-9-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h index 2fe0386a1feef..e70f163fab900 100644 --- a/drivers/scsi/lpfc/lpfc_version.h +++ b/drivers/scsi/lpfc/lpfc_version.h @@ -20,7 +20,7 @@ * included with this package. 
* *******************************************************************/ -#define LPFC_DRIVER_VERSION "14.4.0.4" +#define LPFC_DRIVER_VERSION "14.4.0.5" #define LPFC_DRIVER_NAME "lpfc" /* Used for SLI 2/3 */ From ce47f7cbbcadbc716325ccdd3be5d71f1e10a966 Mon Sep 17 00:00:00 2001 From: Chunhui Li Date: Wed, 11 Sep 2024 11:28:02 +0800 Subject: [PATCH 174/352] module: abort module loading when sysfs setup suffer errors When insmod a kernel module, if fails in add_notes_attrs or add_sysfs_attrs such as memory allocation fail, mod_sysfs_setup will still return success, but we can't access user interface on android device. Patch for make mod_sysfs_setup can check the error of add_notes_attrs and add_sysfs_attrs [mcgrof: the section stuff comes from linux history.git [0]] Fixes: 3f7b0672086b ("Module section offsets in /sys/module") [0] Fixes: 6d76013381ed ("Add /sys/module/name/notes") Acked-by: Luis Chamberlain Reviewed-by: Petr Pavlu Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409010016.3XIFSmRA-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202409072018.qfEzZbO7-lkp@intel.com/ Link: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/?id=3f7b0672086b97b2d7f322bdc289cbfa203f10ef [0] Signed-off-by: Xion Wang Signed-off-by: Chunhui Li Signed-off-by: Luis Chamberlain --- kernel/module/sysfs.c | 63 ++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index 26efe1305c120..456358e1fdc43 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -69,12 +69,13 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) kfree(sect_attrs); } -static void add_sect_attrs(struct module *mod, const struct load_info *info) +static int add_sect_attrs(struct module *mod, const struct load_info *info) { unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; struct module_sect_attr *sattr; struct bin_attribute **gattr; + int ret; /* Count loaded sections and allocate structures */ for (i = 0; i < info->hdr->e_shnum; i++) @@ -85,7 +86,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); if (!sect_attrs) - return; + return -ENOMEM; /* Setup section attributes. 
*/ sect_attrs->grp.name = "sections"; @@ -103,8 +104,10 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) sattr->address = sec->sh_addr; sattr->battr.attr.name = kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); - if (!sattr->battr.attr.name) + if (!sattr->battr.attr.name) { + ret = -ENOMEM; goto out; + } sect_attrs->nsections++; sattr->battr.read = module_sect_read; sattr->battr.size = MODULE_SECT_READ_SIZE; @@ -113,13 +116,15 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) } *gattr = NULL; - if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) + ret = sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp); + if (ret) goto out; mod->sect_attrs = sect_attrs; - return; + return 0; out: free_sect_attrs(sect_attrs); + return ret; } static void remove_sect_attrs(struct module *mod) @@ -158,15 +163,12 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, kfree(notes_attrs); } -static void add_notes_attrs(struct module *mod, const struct load_info *info) +static int add_notes_attrs(struct module *mod, const struct load_info *info) { unsigned int notes, loaded, i; struct module_notes_attrs *notes_attrs; struct bin_attribute *nattr; - - /* failed to create section attributes, so can't create notes */ - if (!mod->sect_attrs) - return; + int ret; /* Count notes sections and allocate structures. */ notes = 0; @@ -176,12 +178,12 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) ++notes; if (notes == 0) - return; + return 0; notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), GFP_KERNEL); if (!notes_attrs) - return; + return -ENOMEM; notes_attrs->notes = notes; nattr = ¬es_attrs->attrs[0]; @@ -201,19 +203,23 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) } notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); - if (!notes_attrs->dir) + if (!notes_attrs->dir) { + ret = -ENOMEM; goto out; + } - for (i = 0; i < notes; ++i) - if (sysfs_create_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i])) + for (i = 0; i < notes; ++i) { + ret = sysfs_create_bin_file(notes_attrs->dir, ¬es_attrs->attrs[i]); + if (ret) goto out; + } mod->notes_attrs = notes_attrs; - return; + return 0; out: free_notes_attrs(notes_attrs, i); + return ret; } static void remove_notes_attrs(struct module *mod) @@ -223,9 +229,15 @@ static void remove_notes_attrs(struct module *mod) } #else /* !CONFIG_KALLSYMS */ -static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { } +static inline int add_sect_attrs(struct module *mod, const struct load_info *info) +{ + return 0; +} static inline void remove_sect_attrs(struct module *mod) { } -static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { } +static inline int add_notes_attrs(struct module *mod, const struct load_info *info) +{ + return 0; +} static inline void remove_notes_attrs(struct module *mod) { } #endif /* CONFIG_KALLSYMS */ @@ -385,11 +397,20 @@ int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_modinfo_attrs; - add_sect_attrs(mod, info); - add_notes_attrs(mod, info); + err = add_sect_attrs(mod, info); + if (err) + goto out_del_usage_links; + + err = add_notes_attrs(mod, info); + if (err) + goto out_unreg_sect_attrs; return 0; +out_unreg_sect_attrs: + remove_sect_attrs(mod); +out_del_usage_links: + del_usage_links(mod); out_unreg_modinfo_attrs: module_remove_modinfo_attrs(mod, -1); out_unreg_param: From b319cea80539df9bea0ad98cb5e4b2fcb7e1a34b 
Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 10 Sep 2024 08:31:23 +0100 Subject: [PATCH 175/352] module: Refine kmemleak scanned areas commit ac3b43283923 ("module: replace module_layout with module_memory") introduced a set of memory regions for the module layout sharing the same attributes. However, it didn't update the kmemleak scanned areas which intended to limit kmemleak scan to sections containing writable data. This means sections such as .text and .rodata are scanned by kmemleak. Refine the scanned areas for modules by limiting it to MOD_TEXT and MOD_INIT_TEXT mod_mem regions. CC: Song Liu Reviewed-by: Catalin Marinas Signed-off-by: Vincent Donnefort Signed-off-by: Luis Chamberlain --- kernel/module/debug_kmemleak.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c index 12a569d361e80..b4cc03842d703 100644 --- a/kernel/module/debug_kmemleak.c +++ b/kernel/module/debug_kmemleak.c @@ -12,19 +12,9 @@ void kmemleak_load_module(const struct module *mod, const struct load_info *info) { - unsigned int i; - - /* only scan the sections containing data */ - kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - - for (i = 1; i < info->hdr->e_shnum; i++) { - /* Scan all writable sections that's not executable */ - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || - !(info->sechdrs[i].sh_flags & SHF_WRITE) || - (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, - info->sechdrs[i].sh_size, GFP_KERNEL); + /* only scan writable, non-executable sections */ + for_each_mod_mem_type(type) { + if (type != MOD_DATA && type != MOD_INIT_DATA) + kmemleak_no_scan(mod->mem[type].base); } } From 9498f2e24ee0133d486667c9fa4c27ecdaadc272 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20H=C3=B6gander?= Date: Fri, 6 Sep 2024 10:00:33 +0300 Subject: [PATCH 176/352] drm/i915/psr: Do not wait for PSR being idle on on Panel Replay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We do not have ALPM on DP Panel Replay. Due to this SRD_STATUS[SRD State] doesn't change from SRDENT_ON after Panel Replay is enabled until it gets disabled. On eDP Panel Replay DEEP_SLEEP is not reached. _psr2_ready_for_pipe_update_locked is waiting DEEP_SLEEP bit getting reset. Take these into account in Panel Replay code by not waiting PSR getting idle after enabling VBI. Fixes: 29fb595d4875 ("drm/i915/psr: Panel replay uses SRD_STATUS to track it's status") Cc: Animesh Manna Signed-off-by: Jouni Högander Reviewed-by: Animesh Manna Link: https://patchwork.freedesktop.org/patch/msgid/20240906070033.289015-5-jouni.hogander@intel.com (cherry picked from commit a2d98feb4b0013ef4f9db0d8f642a8ac1f5ecbb9) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_psr.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 1f83b3b67ea6c..3d8037b5401ed 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -2784,13 +2784,6 @@ static int _psr1_ready_for_pipe_update_locked(struct intel_dp *intel_dp) EDP_PSR_STATUS_STATE_MASK, 50); } -static int _panel_replay_ready_for_pipe_update_locked(struct intel_dp *intel_dp) -{ - return intel_dp_is_edp(intel_dp) ? 
- _psr2_ready_for_pipe_update_locked(intel_dp) : - _psr1_ready_for_pipe_update_locked(intel_dp); -} - /** * intel_psr_wait_for_idle_locked - wait for PSR be ready for a pipe update * @new_crtc_state: new CRTC state @@ -2813,12 +2806,10 @@ void intel_psr_wait_for_idle_locked(const struct intel_crtc_state *new_crtc_stat lockdep_assert_held(&intel_dp->psr.lock); - if (!intel_dp->psr.enabled) + if (!intel_dp->psr.enabled || intel_dp->psr.panel_replay_enabled) continue; - if (intel_dp->psr.panel_replay_enabled) - ret = _panel_replay_ready_for_pipe_update_locked(intel_dp); - else if (intel_dp->psr.sel_update_enabled) + if (intel_dp->psr.sel_update_enabled) ret = _psr2_ready_for_pipe_update_locked(intel_dp); else ret = _psr1_ready_for_pipe_update_locked(intel_dp); From fcd33d434d31a210bc9f209b5bfd92f3b91a2dda Mon Sep 17 00:00:00 2001 From: Arun R Murthy Date: Tue, 27 Aug 2024 13:42:05 +0530 Subject: [PATCH 177/352] drm/i915/display: BMG supports UHBR13.5 UHBR20 is not supported by battlemage and the maximum link rate supported is UHBR13.5 v2: Replace IS_DGFX with IS_BATTLEMAGE (Jani) HSD: 16023263677 Signed-off-by: Arun R Murthy Reviewed-by: Mika Kahola Fixes: 98b1c87a5e51 ("drm/i915/xe2hpd: Set maximum DP rate to UHBR13.5") Signed-off-by: Suraj Kandpal Link: https://patchwork.freedesktop.org/patch/msgid/20240827081205.136569-1-arun.r.murthy@intel.com (cherry picked from commit 9c2338ac4543e0fab3a1e0f9f025591e0f0d9f8f) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_dp.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index a1fcedfd404b9..b1b512e258abc 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -531,6 +531,10 @@ static void intel_dp_set_source_rates(struct intel_dp *intel_dp) { /* The values must be in increasing order */ + static const int bmg_rates[] = { + 162000, 216000, 243000, 270000, 324000, 432000, 540000, 675000, + 810000, 1000000, 1350000, + }; static const int mtl_rates[] = { 162000, 216000, 243000, 270000, 324000, 432000, 540000, 675000, 810000, 1000000, 2000000, @@ -561,8 +565,13 @@ intel_dp_set_source_rates(struct intel_dp *intel_dp) intel_dp->source_rates || intel_dp->num_source_rates); if (DISPLAY_VER(dev_priv) >= 14) { - source_rates = mtl_rates; - size = ARRAY_SIZE(mtl_rates); + if (IS_BATTLEMAGE(dev_priv)) { + source_rates = bmg_rates; + size = ARRAY_SIZE(bmg_rates); + } else { + source_rates = mtl_rates; + size = ARRAY_SIZE(mtl_rates); + } max_rate = mtl_max_source_rate(intel_dp); } else if (DISPLAY_VER(dev_priv) >= 11) { source_rates = icl_rates; From ec2231b8dd2dc515912ff7816c420153b4a95e92 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 10 Sep 2024 14:18:47 +0300 Subject: [PATCH 178/352] drm/i915/dp: Fix AUX IO power enabling for eDP PSR Panel Self Refresh on eDP requires the AUX IO power to be enabled whenever the output (main link) is enabled. This is required by the AUX_PHY_WAKE/ML_PHY_LOCK signaling initiated by the HW automatically to re-enable the main link after it got disabled in power saving states (see eDP v1.4b, sections 5.1, 6.1.3.3.1.1). The Panel Replay mode on non-eDP outputs on the other hand is only supported by keeping the main link active, thus not requiring the above AUX_PHY_WAKE/ML_PHY_LOCK signaling (eDP v1.4b, section 6.1.3.3.1.2). Thus enabling the AUX IO power for this case is not required either. 
Based on the above enable the AUX IO power only for eDP/PSR outputs. Bspec: 49274, 53370 v2: - Add a TODO comment to adjust the requirement for AUX IO based on whether the ALPM/main-link off mode gets enabled. (Rodrigo) Cc: Animesh Manna Fixes: b8cf5b5d266e ("drm/i915/panelreplay: Initializaton and compute config for panel replay") Reviewed-by: Rodrigo Vivi Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20240910111847.2995725-1-imre.deak@intel.com (cherry picked from commit f7c2ed9d4ce80a2570c492825de239dc8b500f2e) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_ddi.c | 2 +- drivers/gpu/drm/i915/display/intel_psr.c | 19 +++++++++++++++++++ drivers/gpu/drm/i915/display/intel_psr.h | 2 ++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 00fbe9f8c03a9..b1c294236cc87 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -916,7 +916,7 @@ intel_ddi_main_link_aux_domain(struct intel_digital_port *dig_port, * instead of a specific AUX_IO_ reference without powering up any * extra wells. */ - if (intel_encoder_can_psr(&dig_port->base)) + if (intel_psr_needs_aux_io_power(&dig_port->base, crtc_state)) return intel_display_power_aux_io_domain(i915, dig_port->aux_ch); else if (DISPLAY_VER(i915) < 14 && (intel_crtc_has_dp_encoder(crtc_state) || diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 3d8037b5401ed..136a0d6ca9707 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -203,6 +203,25 @@ bool intel_encoder_can_psr(struct intel_encoder *encoder) return false; } +bool intel_psr_needs_aux_io_power(struct intel_encoder *encoder, + const struct intel_crtc_state *crtc_state) +{ + /* + * For PSR/PR modes only eDP requires the AUX IO power to be enabled whenever + * the output is enabled. For non-eDP outputs the main link is always + * on, hence it doesn't require the HW initiated AUX wake-up signaling used + * for eDP. + * + * TODO: + * - Consider leaving AUX IO disabled for eDP / PR as well, in case + * the ALPM with main-link off mode is not enabled. + * - Leave AUX IO enabled for DP / PR, once support for ALPM with + * main-link off mode is added for it and this mode gets enabled. 
+ */ + return intel_crtc_has_type(crtc_state, INTEL_OUTPUT_EDP) && + intel_encoder_can_psr(encoder); +} + static bool psr_global_enabled(struct intel_dp *intel_dp) { struct intel_display *display = to_intel_display(intel_dp); diff --git a/drivers/gpu/drm/i915/display/intel_psr.h b/drivers/gpu/drm/i915/display/intel_psr.h index 4e09c10908e4c..6eb5f15f674fa 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.h +++ b/drivers/gpu/drm/i915/display/intel_psr.h @@ -25,6 +25,8 @@ struct intel_plane_state; (intel_dp)->psr.source_panel_replay_support) bool intel_encoder_can_psr(struct intel_encoder *encoder); +bool intel_psr_needs_aux_io_power(struct intel_encoder *encoder, + const struct intel_crtc_state *crtc_state); void intel_psr_init_dpcd(struct intel_dp *intel_dp); void intel_psr_enable_sink(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state); From 5bab087507ae99250579f1d36071eb6c867065b1 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:08 +0200 Subject: [PATCH 179/352] selftests: kvm: s390: Add VM run test case Add test case running code interacting with registers within a ucontrol VM. * Add uc_gprs test case The test uses the same VM setup using the fixture and debug macros introduced in earlier patches in this series. Signed-off-by: Christoph Schlameuss Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240807154512.316936-7-schlameuss@linux.ibm.com [frankja@linux.ibm.com: Removed leftover comment line] Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-7-schlameuss@linux.ibm.com> --- .../selftests/kvm/s390x/ucontrol_test.c | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c index d103a92e7495b..f257beec1430e 100644 --- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c +++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c @@ -7,6 +7,7 @@ * Authors: * Christoph Schlameuss */ +#include "debug_print.h" #include "kselftest_harness.h" #include "kvm_util.h" #include "processor.h" @@ -40,6 +41,23 @@ void require_ucontrol_admin(void) TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL)); } +/* Test program setting some registers and looping */ +extern char test_gprs_asm[]; +asm("test_gprs_asm:\n" + "xgr %r0, %r0\n" + "lgfi %r1,1\n" + "lgfi %r2,2\n" + "lgfi %r3,3\n" + "lgfi %r4,4\n" + "lgfi %r5,5\n" + "lgfi %r6,6\n" + "lgfi %r7,7\n" + "0:\n" + " diag 0,0,0x44\n" + " ahi %r0,1\n" + " j 0b\n" +); + FIXTURE(uc_kvm) { struct kvm_s390_sie_block *sie_block; @@ -204,4 +222,111 @@ TEST(uc_cap_hpage) close(kvm_fd); } +/* verify SIEIC exit + * * fail on codes not expected in the test cases + */ +static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) * self) +{ + struct kvm_s390_sie_block *sie_block = self->sie_block; + struct kvm_run *run = self->run; + + /* check SIE interception code */ + pr_info("sieic: 0x%.2x 0x%.4x 0x%.4x\n", + run->s390_sieic.icptcode, + run->s390_sieic.ipa, + run->s390_sieic.ipb); + switch (run->s390_sieic.icptcode) { + case ICPT_INST: + /* end execution in caller on intercepted instruction */ + pr_info("sie instruction interception\n"); + return false; + case ICPT_OPEREXC: + /* operation exception */ + TEST_FAIL("sie exception on %.4x%.8x", sie_block->ipa, sie_block->ipb); + default: + TEST_FAIL("UNEXPECTED SIEIC CODE %d", run->s390_sieic.icptcode); + } + return true; +} + +/* verify VM state on exit */ +static bool uc_handle_exit(FIXTURE_DATA(uc_kvm) * self) +{ + struct kvm_run *run = self->run; + + 
switch (run->exit_reason) { + case KVM_EXIT_S390_SIEIC: + return uc_handle_sieic(self); + default: + pr_info("exit_reason %2d not handled\n", run->exit_reason); + } + return true; +} + +/* run the VM until interrupted */ +static int uc_run_once(FIXTURE_DATA(uc_kvm) * self) +{ + int rc; + + rc = ioctl(self->vcpu_fd, KVM_RUN, NULL); + print_run(self->run, self->sie_block); + print_regs(self->run); + pr_debug("run %d / %d %s\n", rc, errno, strerror(errno)); + return rc; +} + +static void uc_assert_diag44(FIXTURE_DATA(uc_kvm) * self) +{ + struct kvm_s390_sie_block *sie_block = self->sie_block; + + /* assert vm was interrupted by diag 0x0044 */ + TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason); + TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode); + TEST_ASSERT_EQ(0x8300, sie_block->ipa); + TEST_ASSERT_EQ(0x440000, sie_block->ipb); +} + +TEST_F(uc_kvm, uc_gprs) +{ + struct kvm_sync_regs *sync_regs = &self->run->s.regs; + struct kvm_run *run = self->run; + struct kvm_regs regs = {}; + + /* Set registers to values that are different from the ones that we expect below */ + for (int i = 0; i < 8; i++) + sync_regs->gprs[i] = 8; + run->kvm_dirty_regs |= KVM_SYNC_GPRS; + + /* copy test_gprs_asm to code_hva / code_gpa */ + TH_LOG("copy code %p to vm mapped memory %p / %p", + &test_gprs_asm, (void *)self->code_hva, (void *)self->code_gpa); + memcpy((void *)self->code_hva, &test_gprs_asm, PAGE_SIZE); + + /* DAT disabled + 64 bit mode */ + run->psw_mask = 0x0000000180000000ULL; + run->psw_addr = self->code_gpa; + + /* run and expect interception of diag 44 */ + ASSERT_EQ(0, uc_run_once(self)); + ASSERT_EQ(false, uc_handle_exit(self)); + uc_assert_diag44(self); + + /* Retrieve and check guest register values */ + ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, ®s)); + for (int i = 0; i < 8; i++) { + ASSERT_EQ(i, regs.gprs[i]); + ASSERT_EQ(i, sync_regs->gprs[i]); + } + + /* run and expect interception of diag 44 again */ + ASSERT_EQ(0, uc_run_once(self)); + ASSERT_EQ(false, uc_handle_exit(self)); + uc_assert_diag44(self); + + /* check continued increment of register 0 value */ + ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, ®s)); + ASSERT_EQ(1, regs.gprs[0]); + ASSERT_EQ(1, sync_regs->gprs[0]); +} + TEST_HARNESS_MAIN From f9b56b2c31e5733c04464da1b73bafb9eff6569f Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:12 +0200 Subject: [PATCH 180/352] s390: Enable KVM_S390_UCONTROL config in debug_defconfig To simplify testing enable UCONTROL KVM by default in debug kernels. 
Signed-off-by: Christoph Schlameuss Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240807154512.316936-11-schlameuss@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-11-schlameuss@linux.ibm.com> --- arch/s390/configs/debug_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index ea63a7342f5ff..0c989caed19af 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -59,6 +59,7 @@ CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_S390_HYPFS_FS=y CONFIG_KVM=m +CONFIG_KVM_S390_UCONTROL=y CONFIG_S390_UNWIND_SELFTEST=m CONFIG_S390_KPROBES_SANITY_TEST=m CONFIG_S390_MODULES_SANITY_TEST=m From 21d1d72475809734a5149ecfffdc7551503b042b Mon Sep 17 00:00:00 2001 From: Kent Russell Date: Tue, 10 Sep 2024 09:21:25 -0400 Subject: [PATCH 181/352] drm/amdkfd: Move queue fs deletion after destroy check We were removing the kernfs entry for queue info before checking if the queue could be destroyed. If it failed to get destroyed (e.g. during some GPU resets), then we would try to delete it later during pqm teardown, but the file was already removed. This led to a kernel WARN trying to remove size, gpuid and type. Move the remove to after the destroy check. Signed-off-by: Kent Russell Reviewed-by: Jonathan Kim Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index b439d4d0bd84a..01b960b152743 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -517,7 +517,6 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (retval) goto err_destroy_queue; - kfd_procfs_del_queue(pqn->q); dqm = pqn->q->device->dqm; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); if (retval) { @@ -527,6 +526,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (retval != -ETIME) goto err_destroy_queue; } + kfd_procfs_del_queue(pqn->q); kfd_queue_release_buffers(pdd, &pqn->q->properties); pqm_clean_queue_resource(pqm, pqn); uninit_queue(pqn->q); From 52755373ea6197dac40b9804ce967611b5e989bf Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Tue, 10 Sep 2024 13:17:30 +0800 Subject: [PATCH 182/352] drm/amdkfd: clean up code for interrupt v10 Variable hub_inst is unused. 
Fixes: e28604d8337e ("drm/amdkfd: Drop poison hanlding from gfx v10") Signed-off-by: Jesse Zhang Reviewed-by: Tim Huang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index bb8cbfc39b90f..37b69fe0ede38 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -306,23 +306,8 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_UTCL2) { struct kfd_vm_fault_info info = {0}; uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); - uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); - uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry); - int hub_inst = 0; struct kfd_hsa_memory_exception_data exception_data; - /* gfxhub */ - if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) { - hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev, - node_id); - if (hub_inst < 0) - hub_inst = 0; - } - - /* mmhub */ - if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC) - hub_inst = node_id / 4; - info.vmid = vmid; info.mc_id = client_id; info.page_addr = ih_ring_entry[4] | From 0da531c82a0fcac65407d28ecdb2a1e19c833df5 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Wed, 7 Aug 2024 17:33:42 +0800 Subject: [PATCH 183/352] drm/amdgpu: ensure the connector is not null before using it This resolves the dereference null return value warning reported by Coverity. Signed-off-by: Tim Huang Reviewed-by: Jesse Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index 092ec11258cdd..046d4c4e02990 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -1474,7 +1474,7 @@ bool amdgpu_display_crtc_scaling_mode_fixup(struct drm_crtc *crtc, if ((!(mode->flags & DRM_MODE_FLAG_INTERLACE)) && ((amdgpu_encoder->underscan_type == UNDERSCAN_ON) || ((amdgpu_encoder->underscan_type == UNDERSCAN_AUTO) && - connector->display_info.is_hdmi && + connector && connector->display_info.is_hdmi && amdgpu_display_is_hdtv_mode(mode)))) { if (amdgpu_encoder->underscan_hborder != 0) amdgpu_crtc->h_border = amdgpu_encoder->underscan_hborder; From c389a0604cfbcdb1f8f53a76560eb31e0700e206 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Mon, 9 Sep 2024 18:51:42 +0800 Subject: [PATCH 184/352] drm/amdgpu: disable GPU RAS bad page feature for specific ASIC The feature is not applicable to specific app platform. 
v2: update the disablement condition and commit description v3: move the setting to amdgpu_ras_check_supported Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 61a2f386d9fbe..2e4c867e1d424 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3468,6 +3468,11 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) /* aca is disabled by default */ adev->aca.is_enabled = false; + + /* bad page feature is not applicable to specific app platform */ + if (adev->gmc.is_app_apu && + amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0)) + amdgpu_bad_page_threshold = 0; } static void amdgpu_ras_counte_dw(struct work_struct *work) From 28b0ef922738b74335c20b8ed4bf8e259353a3a3 Mon Sep 17 00:00:00 2001 From: Bob Zhou Date: Fri, 6 Sep 2024 17:48:20 +0800 Subject: [PATCH 185/352] drm/amdgpu: Fix missing check pcie_p2p module param The module param pcie_p2p should be checked for kfd p2p feature, so add it. Fixes: 75f0efbc4b3b ("drm/amdgpu: Take IOMMU remapping into account for p2p checks") Signed-off-by: Bob Zhou Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f4628412dac44..afb643e03d3c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6189,7 +6189,7 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, p2p_addressable = !(adev->gmc.aper_base & address_mask || aper_limit & address_mask); } - return is_large_bar && p2p_access && p2p_addressable; + return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; #else return false; #endif From 7ab9ebc580617831355843f19224f1e31bb8e983 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 12 Sep 2024 22:38:12 +0200 Subject: [PATCH 186/352] drm/xe/guc: Fix GUC_{SUBMIT,FIRMWARE}_VER helper macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those macros rely on non-existing MAKE_VER_STRUCT macro, while the correct one that should be used is named MAKE_GUC_VER_STRUCT. 
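For context, these helpers pack a dotted version into a single integer so releases can be compared with plain integer operators, as long as each component fits in a byte. A standalone sketch of the encoding, reusing the MAKE_GUC_VER() definition from the header (the 70.30.1 "found" value is made up; 70.29.2 is the minimum named in the Fixes tag below):

    #include <stdio.h>

    /* Same packing as the driver macro: patch in bits 0-7, minor in
     * bits 8-15, major above that. */
    #define MAKE_GUC_VER(maj, min, pat) (((maj) << 16) | ((min) << 8) | (pat))

    int main(void)
    {
            unsigned int need  = MAKE_GUC_VER(70, 29, 2);  /* required minimum */
            unsigned int found = MAKE_GUC_VER(70, 30, 1);  /* made-up detected version */

            /* Packed values order the same way as the dotted versions. */
            printf("need=0x%06x found=0x%06x ok=%d\n", need, found, found >= need);
            return 0;
    }

Since GUC_SUBMIT_VER()/GUC_FIRMWARE_VER() are only expanded where they are used, referencing the non-existent MAKE_VER_STRUCT name would not break the build until a caller actually expanded one of them, which is presumably how the typo slipped through.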
Fixes: 4eb0aab6e443 ("drm/xe/guc: Bump minimum required GuC version to v70.29.2") Signed-off-by: Michal Wajdeczko Cc: Julia Filipchuk Cc: John Harrison Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240912203817.1880-2-michal.wajdeczko@intel.com (cherry picked from commit 02fdf821ed79f59c40d766a85947aa7cc25d4364) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_guc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h index c3e6b51f7a09c..42116b167c985 100644 --- a/drivers/gpu/drm/xe/xe_guc.h +++ b/drivers/gpu/drm/xe/xe_guc.h @@ -18,8 +18,10 @@ */ #define MAKE_GUC_VER(maj, min, pat) (((maj) << 16) | ((min) << 8) | (pat)) #define MAKE_GUC_VER_STRUCT(ver) MAKE_GUC_VER((ver).major, (ver).minor, (ver).patch) -#define GUC_SUBMIT_VER(guc) MAKE_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_COMPATIBILITY]) -#define GUC_FIRMWARE_VER(guc) MAKE_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_RELEASE]) +#define GUC_SUBMIT_VER(guc) \ + MAKE_GUC_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_COMPATIBILITY]) +#define GUC_FIRMWARE_VER(guc) \ + MAKE_GUC_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_RELEASE]) struct drm_printer; From ee06c09ded3c2f722be4e240ed06287e23596bda Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Mon, 16 Sep 2024 09:49:12 +0100 Subject: [PATCH 187/352] drm/xe/vram: fix ccs offset calculation Spec says SW is expected to round up to the nearest 128K, if not already aligned for the CC unit view of CCS. We are seeing the assert sometimes pop on BMG to tell us that there is a hole between GSM and CCS, as well as popping other asserts with having a vram size with strange alignment, which is likely caused by misaligned offset here. v2 (Shuicheng): - Do the round_up() on final SW address. 
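For power-of-two alignments such as SZ_128K, the kernel's round_up() reduces to adding (align - 1) and masking. A minimal standalone illustration of the arithmetic (userspace C; the example offset is made up):

    #include <stdio.h>

    #define SZ_128K (128u * 1024u)

    /* Matches round_up() behaviour for power-of-two alignments. */
    static unsigned long long align_up(unsigned long long x, unsigned long long a)
    {
            return (x + a - 1) & ~(a - 1);
    }

    int main(void)
    {
            unsigned long long offset = 0x61000;  /* made up, not 128K aligned */

            printf("0x%llx -> 0x%llx\n", offset, align_up(offset, SZ_128K));
            /* prints: 0x61000 -> 0x80000 */
            return 0;
    }

A sub-128K shortfall in the computed offset is exactly what the existing assert reports as a hole between GSM and CCS, which matches the symptom described above.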
BSpec: 68023 Fixes: b5c2ca0372dc ("drm/xe/xe2hpg: Determine flat ccs offset for vram") Signed-off-by: Matthew Auld Cc: Himal Prasad Ghimiray Cc: Akshata Jahagirdar Cc: Lucas De Marchi Cc: Shuicheng Lin Cc: Matt Roper Cc: stable@vger.kernel.org # v6.10+ Reviewed-by: Himal Prasad Ghimiray Tested-by: Shuicheng Lin Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240916084911.13119-2-matthew.auld@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 37173392741c425191b959acb3adf70c9a4610c0) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_vram.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c index 5bcd59190353e..80ba2fc78837a 100644 --- a/drivers/gpu/drm/xe/xe_vram.c +++ b/drivers/gpu/drm/xe/xe_vram.c @@ -182,6 +182,7 @@ static inline u64 get_flat_ccs_offset(struct xe_gt *gt, u64 tile_size) offset = offset_hi << 32; /* HW view bits 39:32 */ offset |= offset_lo << 6; /* HW view bits 31:6 */ offset *= num_enabled; /* convert to SW view */ + offset = round_up(offset, SZ_128K); /* SW must round up to nearest 128K */ /* We don't expect any holes */ xe_assert_msg(xe, offset == (xe_mmio_read64_2x32(gt, GSMBASE) - ccs_size), From 99b1f7493bfa757b03d41ee6d7f7d00f81fcba5d Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Wed, 11 Sep 2024 16:55:27 +0100 Subject: [PATCH 188/352] drm/xe/client: fix deadlock in show_meminfo() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a real deadlock as well as sleeping in atomic() bug in here, if the bo put happens to be the last ref, since bo destruction wants to grab the same spinlock and sleeping locks. Fix that by dropping the ref using xe_bo_put_deferred(), and moving the final commit outside of the lock. Dropping the lock around the put is tricky since the bo can go out of scope and delete itself from the list, making it difficult to navigate to the next list entry. 
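The shape of the fix is the usual deferred-release pattern: while walking the list under the spinlock, take a temporary reference and queue the eventual put on a lock-free llist instead of dropping what might be the last reference in place; the queued puts are committed only after the spinlock is released, where sleeping (and re-taking that same spinlock from the destructor) is harmless. A simplified sketch of that pattern (illustrative names; the driver uses xe_bo_put_deferred()/xe_bo_put_commit(), as the change below shows):

    /* Sketch only - not the driver code. */
    LLIST_HEAD(deferred);

    spin_lock(&lock);
    list_for_each_entry(obj, &objects, link) {
            if (!kref_get_unless_zero(&obj->refcount))
                    continue;                 /* already on its way out */
            do_accounting(obj);               /* placeholder for bo_meminfo() */
            queue_put(obj, &deferred);        /* placeholder: never frees under the lock */
    }
    spin_unlock(&lock);

    commit_puts(&deferred);                   /* final puts run here, sleeping allowed */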
Fixes: 0845233388f8 ("drm/xe: Implement fdinfo memory stats printing") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2727 Signed-off-by: Matthew Auld Cc: Himal Prasad Ghimiray Cc: Tejas Upadhyay Cc: "Thomas Hellström" Cc: # v6.8+ Reviewed-by: Matthew Brost Reviewed-by: Tejas Upadhyay Link: https://patchwork.freedesktop.org/patch/msgid/20240911155527.178910-5-matthew.auld@intel.com (cherry picked from commit 0083b8e6f11d7662283a267d4ce7c966812ffd8a) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_drm_client.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c index e64f4b645e2ea..badfa045ead80 100644 --- a/drivers/gpu/drm/xe/xe_drm_client.c +++ b/drivers/gpu/drm/xe/xe_drm_client.c @@ -196,6 +196,7 @@ static void show_meminfo(struct drm_printer *p, struct drm_file *file) struct xe_drm_client *client; struct drm_gem_object *obj; struct xe_bo *bo; + LLIST_HEAD(deferred); unsigned int id; u32 mem_type; @@ -215,11 +216,14 @@ static void show_meminfo(struct drm_printer *p, struct drm_file *file) list_for_each_entry(bo, &client->bos_list, client_link) { if (!kref_get_unless_zero(&bo->ttm.base.refcount)) continue; + bo_meminfo(bo, stats); - xe_bo_put(bo); + xe_bo_put_deferred(bo, &deferred); } spin_unlock(&client->bos_lock); + xe_bo_put_commit(&deferred); + for (mem_type = XE_PL_SYSTEM; mem_type < TTM_NUM_MEM_TYPES; ++mem_type) { if (!xe_mem_type_to_name[mem_type]) continue; From 69bbe3adf36de47315498c9384f99a1ff9171694 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Wed, 11 Sep 2024 16:55:28 +0100 Subject: [PATCH 189/352] drm/xe/client: add missing bo locking in show_meminfo() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bo_meminfo() wants to inspect bo state like tt and the ttm resource, however this state can change at any point leading to stuff like NPD and UAF, if the bo lock is not held. Grab the bo lock when calling bo_meminfo(), ensuring we drop any spinlocks first. In the case of object_idr we now also need to hold a ref. 
v2 (MattB) - Also add xe_bo_assert_held() Fixes: 0845233388f8 ("drm/xe: Implement fdinfo memory stats printing") Signed-off-by: Matthew Auld Cc: Himal Prasad Ghimiray Cc: Tejas Upadhyay Cc: "Thomas Hellström" Cc: # v6.8+ Reviewed-by: Matthew Brost Reviewed-by: Tejas Upadhyay Link: https://patchwork.freedesktop.org/patch/msgid/20240911155527.178910-6-matthew.auld@intel.com (cherry picked from commit 4f63d712fa104c3ebefcb289d1e733e86d8698c7) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_drm_client.c | 39 +++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c index badfa045ead80..95a05c5bc897a 100644 --- a/drivers/gpu/drm/xe/xe_drm_client.c +++ b/drivers/gpu/drm/xe/xe_drm_client.c @@ -10,6 +10,7 @@ #include #include +#include "xe_assert.h" #include "xe_bo.h" #include "xe_bo_types.h" #include "xe_device_types.h" @@ -151,10 +152,13 @@ void xe_drm_client_add_bo(struct xe_drm_client *client, */ void xe_drm_client_remove_bo(struct xe_bo *bo) { + struct xe_device *xe = ttm_to_xe_device(bo->ttm.bdev); struct xe_drm_client *client = bo->client; + xe_assert(xe, !kref_read(&bo->ttm.base.refcount)); + spin_lock(&client->bos_lock); - list_del(&bo->client_link); + list_del_init(&bo->client_link); spin_unlock(&client->bos_lock); xe_drm_client_put(client); @@ -166,6 +170,8 @@ static void bo_meminfo(struct xe_bo *bo, u64 sz = bo->size; u32 mem_type; + xe_bo_assert_held(bo); + if (bo->placement.placement) mem_type = bo->placement.placement->mem_type; else @@ -207,7 +213,20 @@ static void show_meminfo(struct drm_printer *p, struct drm_file *file) idr_for_each_entry(&file->object_idr, obj, id) { struct xe_bo *bo = gem_to_xe_bo(obj); - bo_meminfo(bo, stats); + if (dma_resv_trylock(bo->ttm.base.resv)) { + bo_meminfo(bo, stats); + xe_bo_unlock(bo); + } else { + xe_bo_get(bo); + spin_unlock(&file->table_lock); + + xe_bo_lock(bo, false); + bo_meminfo(bo, stats); + xe_bo_unlock(bo); + + xe_bo_put(bo); + spin_lock(&file->table_lock); + } } spin_unlock(&file->table_lock); @@ -217,7 +236,21 @@ static void show_meminfo(struct drm_printer *p, struct drm_file *file) if (!kref_get_unless_zero(&bo->ttm.base.refcount)) continue; - bo_meminfo(bo, stats); + if (dma_resv_trylock(bo->ttm.base.resv)) { + bo_meminfo(bo, stats); + xe_bo_unlock(bo); + } else { + spin_unlock(&client->bos_lock); + + xe_bo_lock(bo, false); + bo_meminfo(bo, stats); + xe_bo_unlock(bo); + + spin_lock(&client->bos_lock); + /* The bo ref will prevent this bo from being removed from the list */ + xe_assert(xef->xe, !list_empty(&bo->client_link)); + } + xe_bo_put_deferred(bo, &deferred); } spin_unlock(&client->bos_lock); From 73d10c7788f6d2b7badf9973afbdea7ca433c15d Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Wed, 11 Sep 2024 16:55:29 +0100 Subject: [PATCH 190/352] drm/xe/client: use mem_type from the current resource MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather extract the mem_type from the current resource. Checking the first potential placement doesn't really tell us where the bo is currently allocated, especially if there are multiple potential placements. 
Signed-off-by: Matthew Auld Cc: Himal Prasad Ghimiray Cc: Tejas Upadhyay Cc: "Thomas Hellström" Reviewed-by: Matthew Brost Reviewed-by: Tejas Upadhyay Link: https://patchwork.freedesktop.org/patch/msgid/20240911155527.178910-7-matthew.auld@intel.com (cherry picked from commit fbd73b7d2ae29ef0f604f376bcc22b886a49329e) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_drm_client.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c index 95a05c5bc897a..c4add8b38bbd2 100644 --- a/drivers/gpu/drm/xe/xe_drm_client.c +++ b/drivers/gpu/drm/xe/xe_drm_client.c @@ -168,15 +168,10 @@ static void bo_meminfo(struct xe_bo *bo, struct drm_memory_stats stats[TTM_NUM_MEM_TYPES]) { u64 sz = bo->size; - u32 mem_type; + u32 mem_type = bo->ttm.resource->mem_type; xe_bo_assert_held(bo); - if (bo->placement.placement) - mem_type = bo->placement.placement->mem_type; - else - mem_type = XE_PL_TT; - if (drm_gem_object_is_shared_for_memory_stats(&bo->ttm.base)) stats[mem_type].shared += sz; else From ddc73c465628ab3e60f7eb5b4063b644c18b6336 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Wed, 11 Sep 2024 16:55:30 +0100 Subject: [PATCH 191/352] drm/xe/bo: add some annotations in bo_put() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the put() triggers bo destroy then there is at least one potential sleeping lock. Also annotate bos_lock and ggtt lock. Signed-off-by: Matthew Auld Cc: Himal Prasad Ghimiray Cc: Tejas Upadhyay Cc: "Thomas Hellström" Reviewed-by: Matthew Brost Reviewed-by: Tejas Upadhyay Link: https://patchwork.freedesktop.org/patch/msgid/20240911155527.178910-8-matthew.auld@intel.com (cherry picked from commit 3b04c2cfd71c54117237c72f2a08ff0ae1f602e2) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_bo.c | 14 ++++++++++++++ drivers/gpu/drm/xe/xe_bo.h | 6 +----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 06911e9a3bf51..f379df3a12bfa 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -2320,6 +2320,20 @@ void xe_bo_put_commit(struct llist_head *deferred) drm_gem_object_free(&bo->ttm.base.refcount); } +void xe_bo_put(struct xe_bo *bo) +{ + might_sleep(); + if (bo) { +#ifdef CONFIG_PROC_FS + if (bo->client) + might_lock(&bo->client->bos_lock); +#endif + if (bo->ggtt_node && bo->ggtt_node->ggtt) + might_lock(&bo->ggtt_node->ggtt->lock); + drm_gem_object_put(&bo->ttm.base); + } +} + /** * xe_bo_dumb_create - Create a dumb bo as backing for a fb * @file_priv: ... diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h index dbfb3209615df..6e4be52306dfc 100644 --- a/drivers/gpu/drm/xe/xe_bo.h +++ b/drivers/gpu/drm/xe/xe_bo.h @@ -126,11 +126,7 @@ static inline struct xe_bo *xe_bo_get(struct xe_bo *bo) return bo; } -static inline void xe_bo_put(struct xe_bo *bo) -{ - if (bo) - drm_gem_object_put(&bo->ttm.base); -} +void xe_bo_put(struct xe_bo *bo); static inline void __xe_bo_unset_bulk_move(struct xe_bo *bo) { From 9460f4bd5970f2e46fe190a0cb9814697bd7f21a Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 10 Sep 2024 18:18:20 -0700 Subject: [PATCH 192/352] drm/xe: Do not run GPU page fault handler on a closed VM Closing a VM removes page table memory thus we shouldn't touch page tables when a VM is closed. Do not run the GPU page fault handler once the VM is closed to avoid touching page tables. 
Signed-off-by: Matthew Brost Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20240911011820.825127-1-matthew.brost@intel.com (cherry picked from commit f96dbf7c321d70834d46f3aedb75a671e839b51e) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_gt_pagefault.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c index 730eec07795e2..00af059a8971a 100644 --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c @@ -212,6 +212,12 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf) * TODO: Change to read lock? Using write lock for simplicity. */ down_write(&vm->lock); + + if (xe_vm_is_closed(vm)) { + err = -ENOENT; + goto unlock_vm; + } + vma = lookup_vma(vm, pf->page_addr); if (!vma) { err = -EINVAL; From 6c10ba06bb1b48acce6d4d9c1e33beb9954f1788 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Roberto=20de=20Souza?= Date: Thu, 12 Sep 2024 08:38:42 -0700 Subject: [PATCH 193/352] drm/xe/oa: Fix overflow in oa batch buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By default xe_bb_create_job() appends a MI_BATCH_BUFFER_END to batch buffer, this is not a problem if batch buffer is only used once but oa reuses the batch buffer for the same metric and at each call it appends a MI_BATCH_BUFFER_END, printing the warning below and then overflowing. [ 381.072016] ------------[ cut here ]------------ [ 381.072019] xe 0000:00:02.0: [drm] Assertion `bb->len * 4 + bb_prefetch(q->gt) <= size` failed! platform: LUNARLAKE subplatform: 1 graphics: Xe2_LPG / Xe2_HPG 20.04 step B0 media: Xe2_LPM / Xe2_HPM 20.00 step B0 tile: 0 VRAM 0 B GT: 0 type 1 So here checking if batch buffer already have MI_BATCH_BUFFER_END if not append it. v2: - simply fix, suggestion from Ashutosh Cc: Ashutosh Dixit Signed-off-by: José Roberto de Souza Reviewed-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20240912153842.35813-1-jose.souza@intel.com (cherry picked from commit 9ba0e0f30ca42a98af3689460063edfb6315718a) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_bb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_bb.c b/drivers/gpu/drm/xe/xe_bb.c index a13e0b3a169ed..ef777dbdf4ecc 100644 --- a/drivers/gpu/drm/xe/xe_bb.c +++ b/drivers/gpu/drm/xe/xe_bb.c @@ -65,7 +65,8 @@ __xe_bb_create_job(struct xe_exec_queue *q, struct xe_bb *bb, u64 *addr) { u32 size = drm_suballoc_size(bb->bo); - bb->cs[bb->len++] = MI_BATCH_BUFFER_END; + if (bb->len == 0 || bb->cs[bb->len - 1] != MI_BATCH_BUFFER_END) + bb->cs[bb->len++] = MI_BATCH_BUFFER_END; xe_gt_assert(q->gt, bb->len * 4 + bb_prefetch(q->gt) <= size); From bfc00a7754c40544c7446d3b664049d6e00ee0bd Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 10 Sep 2024 09:33:30 -0400 Subject: [PATCH 194/352] drm/amdgpu/gfx9.4.3: drop extra wrapper Drop wrapper used in one place. gfx_v9_4_3_xcc_cp_enable() is used in one place. gfx_v9_4_3_xcc_cp_compute_enable() is used everywhere else. 
Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 408e5600bb617..b940d2ad57dba 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -2299,12 +2299,6 @@ static int gfx_v9_4_3_cp_resume(struct amdgpu_device *adev) return 0; } -static void gfx_v9_4_3_xcc_cp_enable(struct amdgpu_device *adev, bool enable, - int xcc_id) -{ - gfx_v9_4_3_xcc_cp_compute_enable(adev, enable, xcc_id); -} - static void gfx_v9_4_3_xcc_fini(struct amdgpu_device *adev, int xcc_id) { if (amdgpu_gfx_disable_kcq(adev, xcc_id)) @@ -2336,7 +2330,7 @@ static void gfx_v9_4_3_xcc_fini(struct amdgpu_device *adev, int xcc_id) } gfx_v9_4_3_xcc_kcq_fini_register(adev, xcc_id); - gfx_v9_4_3_xcc_cp_enable(adev, false, xcc_id); + gfx_v9_4_3_xcc_cp_compute_enable(adev, false, xcc_id); } static int gfx_v9_4_3_hw_init(void *handle) From 902b4027216aeaabe1562e1db070550c06f3ec89 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 10 Sep 2024 10:19:43 -0400 Subject: [PATCH 195/352] drm/amdgpu: fix spelling in amd_shared.h Fix spelling in documentation. Reviewed-by: Kent Russell Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/include/amd_shared.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h index 745fd052840dc..3f91926a50e99 100644 --- a/drivers/gpu/drm/amd/include/amd_shared.h +++ b/drivers/gpu/drm/amd/include/amd_shared.h @@ -85,7 +85,7 @@ enum amd_apu_flags { * @AMD_IP_BLOCK_TYPE_MES: Micro-Engine Scheduler * @AMD_IP_BLOCK_TYPE_JPEG: JPEG Engine * @AMD_IP_BLOCK_TYPE_VPE: Video Processing Engine -* @AMD_IP_BLOCK_TYPE_UMSCH_MM: User Mode Schduler for Multimedia +* @AMD_IP_BLOCK_TYPE_UMSCH_MM: User Mode Scheduler for Multimedia * @AMD_IP_BLOCK_TYPE_ISP: Image Signal Processor * @AMD_IP_BLOCK_TYPE_NUM: Total number of IP block types */ From 0110ac11952f06419d267f51a3989e989b17e67a Mon Sep 17 00:00:00 2001 From: Yan Zhen Date: Wed, 11 Sep 2024 12:27:38 +0800 Subject: [PATCH 196/352] drm/amdgpu: fix typo in the comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correctly spelled comments make it easier for the reader to understand the code. Replace 'udpate' with 'update' in the comment & replace 'recieved' with 'received' in the comment & replace 'dsiable' with 'disable' in the comment & replace 'Initiailize' with 'Initialize' in the comment & replace 'disble' with 'disable' in the comment & replace 'Disbale' with 'Disable' in the comment & replace 'enogh' with 'enough' in the comment & replace 'availabe' with 'available' in the comment. 
Acked-by: Christian König Signed-off-by: Yan Zhen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/amd/amdgpu/imu_v11_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c | 2 +- drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/smuio_v9_0.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 57bda66e85ef7..2ca1271731357 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -511,7 +511,7 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h return -EINVAL; } - /* udpate aca bank to aca source error_cache first */ + /* update aca bank to aca source error_cache first */ ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL); if (ret) return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 189574d53ebd3..e9e599ff3bd48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2853,7 +2853,7 @@ static int psp_load_non_psp_fw(struct psp_context *psp) if (ret) return ret; - /* Start rlc autoload after psp recieved all the gfx firmware */ + /* Start rlc autoload after psp received all the gfx firmware */ if (psp->autoload_supported && ucode->ucode_id == (amdgpu_sriov_vf(adev) ? adev->virt.autoload_ucode_id : AMDGPU_UCODE_ID_RLC_G)) { ret = psp_rlc_autoload_start(psp); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2e4c867e1d424..1a1395c5fff15 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -882,7 +882,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, if (ret) return ret; - /* gfx block ras dsiable cmd must send to ras-ta */ + /* gfx block ras disable cmd must send to ras-ta */ if (head->block == AMDGPU_RAS_BLOCK__GFX) con->features |= BIT(head->block); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index b8bc7fa8c3750..74adb983ab03e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1970,7 +1970,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) DRM_INFO("amdgpu: %uM of GTT memory ready.\n", (unsigned int)(gtt_size / (1024 * 1024))); - /* Initiailize doorbell pool on PCI BAR */ + /* Initialize doorbell pool on PCI BAR */ r = amdgpu_ttm_init_on_chip(adev, AMDGPU_PL_DOORBELL, adev->doorbell.size / PAGE_SIZE); if (r) { DRM_ERROR("Failed initializing doorbell heap.\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c b/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c index 6c1891889c4da..d4f72e47ae9e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c @@ -153,7 +153,7 @@ static void imu_v11_0_setup(struct amdgpu_device *adev) WREG32_SOC15(GC, 0, regGFX_IMU_C2PMSG_16, imu_reg_val); } - //disble imu Rtavfs, SmsRepair, DfllBTC, and ClkB + //disable imu Rtavfs, SmsRepair, DfllBTC, and ClkB imu_reg_val = RREG32_SOC15(GC, 0, regGFX_IMU_SCRATCH_10); imu_reg_val |= 0x10007; WREG32_SOC15(GC, 0, regGFX_IMU_SCRATCH_10, imu_reg_val); diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c index 
fa479dfa1ec15..739fce4fa8fdf 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c @@ -365,7 +365,7 @@ static void nbio_v2_3_enable_aspm(struct amdgpu_device *adev, data &= ~PCIE_LC_CNTL__LC_PMI_TO_L1_DIS_MASK; } else { - /* Disbale ASPM L1 */ + /* Disable ASPM L1 */ data &= ~PCIE_LC_CNTL__LC_L1_INACTIVITY_MASK; /* Disable ASPM TxL0s */ data &= ~PCIE_LC_CNTL__LC_L0S_INACTIVITY_MASK; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c index aa637541da584..e65194fe94af6 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c @@ -710,7 +710,7 @@ static int sdma_v3_0_gfx_resume(struct amdgpu_device *adev) upper_32_bits(wptr_gpu_addr)); wptr_poll_cntl = RREG32(mmSDMA0_GFX_RB_WPTR_POLL_CNTL + sdma_offsets[i]); if (ring->use_pollmem) { - /*wptr polling is not enogh fast, directly clean the wptr register */ + /*wptr polling is not enough fast, directly clean the wptr register */ WREG32(mmSDMA0_GFX_RB_WPTR + sdma_offsets[i], 0); wptr_poll_cntl = REG_SET_FIELD(wptr_poll_cntl, SDMA0_GFX_RB_WPTR_POLL_CNTL, diff --git a/drivers/gpu/drm/amd/amdgpu/smuio_v9_0.c b/drivers/gpu/drm/amd/amdgpu/smuio_v9_0.c index e4e30b9d481b4..c04fdd2d5b389 100644 --- a/drivers/gpu/drm/amd/amdgpu/smuio_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/smuio_v9_0.c @@ -60,7 +60,7 @@ static void smuio_v9_0_get_clock_gating_state(struct amdgpu_device *adev, u64 *f { u32 data; - /* CGTT_ROM_CLK_CTRL0 is not availabe for APUs */ + /* CGTT_ROM_CLK_CTRL0 is not available for APUs */ if (adev->flags & AMD_IS_APU) return; From c400ec6990fb04a2ec9929b253dafa7e77c7f555 Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Tue, 10 Sep 2024 01:41:05 +0300 Subject: [PATCH 197/352] drm/amdgpu: Fix a typo Fix a typo in comments. Reported-by: Matthew Wilcox Signed-off-by: Andrew Kreimer Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index aab8077e50988..f28f6b4ba765d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -58,7 +58,7 @@ #define EEPROM_I2C_MADDR_4 0x40000 /* - * The 2 macros bellow represent the actual size in bytes that + * The 2 macros below represent the actual size in bytes that * those entities occupy in the EEPROM memory. * RAS_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which * uses uint64 to store 6b fields such as retired_page. From c77a46bebe99e70600e636a3a1f285637c479f46 Mon Sep 17 00:00:00 2001 From: Yan Zhen Date: Thu, 12 Sep 2024 15:12:09 +0800 Subject: [PATCH 198/352] drm/amd/display: fix typo in the comment Correctly spelled comments make it easier for the reader to understand the code. Replace 'maxium' with 'maximum' in the comment & replace 'diffculty' with 'difficulty' in the comment & replace 'suppluy' with 'supply' in the comment & replace 'Congiuration' with 'Configuration' in the comment & replace 'eanbled' with 'enabled' in the comment. 
Signed-off-by: Yan Zhen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/basics/dce_calcs.c | 2 +- drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c | 6 +++--- drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/basics/dce_calcs.c b/drivers/gpu/drm/amd/display/dc/basics/dce_calcs.c index e47e9db062f44..681799468487c 100644 --- a/drivers/gpu/drm/amd/display/dc/basics/dce_calcs.c +++ b/drivers/gpu/drm/amd/display/dc/basics/dce_calcs.c @@ -569,7 +569,7 @@ static void calculate_bandwidth( break; } data->lb_partitions[i] = bw_floor2(bw_div(data->lb_size_per_component[i], data->lb_line_pitch), bw_int_to_fixed(1)); - /*clamp the partitions to the maxium number supported by the lb*/ + /* clamp the partitions to the maximum number supported by the lb */ if ((surface_type[i] != bw_def_graphics || dceip->graphics_lb_nodownscaling_multi_line_prefetching == 1)) { data->lb_partitions_max[i] = bw_int_to_fixed(10); } diff --git a/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c b/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c index 547dfcc80fde4..d851c081e3768 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c @@ -8926,7 +8926,7 @@ void dml_core_mode_programming(struct display_mode_lib_st *mode_lib, const struc // The prefetch scheduling should only be calculated once as per AllowForPStateChangeOrStutterInVBlank requirement // If the AllowForPStateChangeOrStutterInVBlank requirement is not strict (i.e. only try those power saving feature - // if possible, then will try to program for the best power saving features in order of diffculty (dram, fclk, stutter) + // if possible, then will try to program for the best power saving features in order of difficulty (dram, fclk, stutter) s->iteration = 0; s->MaxTotalRDBandwidth = 0; s->AllPrefetchModeTested = false; @@ -9977,7 +9977,7 @@ void dml_core_get_row_heights( dml_print("DML_DLG: %s: GPUVMMinPageSizeKBytes = %u\n", __func__, GPUVMMinPageSizeKBytes); #endif - // just suppluy with enough parameters to calculate meta and dte + // just supply with enough parameters to calculate meta and dte CalculateVMAndRowBytes( 0, // dml_bool_t ViewportStationary, 1, // dml_bool_t DCCEnable, @@ -10110,7 +10110,7 @@ dml_bool_t dml_mode_support( /// Note: In this function, it is assumed that DCFCLK, SOCCLK freq are the state values, and mode_program will just use the DML calculated DPPCLK and DISPCLK /// @param mode_lib mode_lib data struct that house all the input/output/bbox and calculation values. /// @param state_idx Power state idx chosen -/// @param display_cfg Display Congiuration +/// @param display_cfg Display Configuration /// @param call_standalone Calling mode_programming without calling mode support. 
Some of the "support" struct member will be pre-calculated before doing mode programming /// TODO: Add clk_cfg input, could be useful for standalone mode dml_bool_t dml_mode_programming( diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c index 42c52284a8680..bded33575493b 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c @@ -455,7 +455,7 @@ bool dcn30_mmhubbub_warmup( struct mcif_wb *mcif_wb; struct mcif_warmup_params warmup_params = {0}; unsigned int i, i_buf; - /*make sure there is no active DWB eanbled */ + /* make sure there is no active DWB enabled */ for (i = 0; i < num_dwb; i++) { dwb = dc->res_pool->dwbc[wb_info[i].dwb_pipe_inst]; if (dwb->dwb_is_efc_transition || dwb->dwb_is_drc) { From 2ed186df27f078eb75c52d09e04aa7b0f9920f57 Mon Sep 17 00:00:00 2001 From: Kent Russell Date: Tue, 10 Sep 2024 09:32:13 -0400 Subject: [PATCH 199/352] drm/amdgpu: Retry i2c transfer once if it fails on SMU13.0.6 During init, there can be some collisions on the i2c bus that result in the EEPROM read failing. This has been mitigated in the PMFW to a degree, but there is still a small chance that the bus will be busy. When the read fails during RAS init, that disables page retirement altogether, which is obviously not ideal. To try to avoid that situation, set the eeprom_read function to retry once if the first read fails, specifically for smu_v13_0_6. Signed-off-by: Kent Russell Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 9974c9f8135e9..55ed6247eb61f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2107,8 +2107,12 @@ static int smu_v13_0_6_i2c_xfer(struct i2c_adapter *i2c_adap, } mutex_lock(&adev->pm.mutex); r = smu_v13_0_6_request_i2c_xfer(smu, req); - if (r) - goto fail; + if (r) { + /* Retry once, in case of an i2c collision */ + r = smu_v13_0_6_request_i2c_xfer(smu, req); + if (r) + goto fail; + } for (c = i = 0; i < num_msgs; i++) { if (!(msg[i].flags & I2C_M_RD)) { From 3c75518cf27aa5a7e22e1f8f33339ded3779079b Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 11 Sep 2024 16:24:35 +0800 Subject: [PATCH 200/352] drm/amdgpu/mes12: switch SET_SHADER_DEBUGGER pkt to mes schq pipe The SET_SHADER_DEBUGGER packet must work with the added hardware queue, switch the packet submitting to mes schq pipe. 
Signed-off-by: Jack Xiao Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index e499b2857a01e..ef05a41162306 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -479,6 +479,11 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes, union MESAPI__MISC misc_pkt; int pipe; + if (mes->adev->enable_uni_mes) + pipe = AMDGPU_MES_KIQ_PIPE; + else + pipe = AMDGPU_MES_SCHED_PIPE; + memset(&misc_pkt, 0, sizeof(misc_pkt)); misc_pkt.header.type = MES_API_TYPE_SCHEDULER; @@ -513,6 +518,7 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes, misc_pkt.wait_reg_mem.reg_offset2 = input->wrm_reg.reg1; break; case MES_MISC_OP_SET_SHADER_DEBUGGER: + pipe = AMDGPU_MES_SCHED_PIPE; misc_pkt.opcode = MESAPI_MISC__SET_SHADER_DEBUGGER; misc_pkt.set_shader_debugger.process_context_addr = input->set_shader_debugger.process_context_addr; @@ -530,11 +536,6 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes, return -EINVAL; } - if (mes->adev->enable_uni_mes) - pipe = AMDGPU_MES_KIQ_PIPE; - else - pipe = AMDGPU_MES_SCHED_PIPE; - return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &misc_pkt, sizeof(misc_pkt), offsetof(union MESAPI__MISC, api_status)); From 03b5038c0ad069380fab7e251d2bf3f1540d20f4 Mon Sep 17 00:00:00 2001 From: David Belanger Date: Wed, 11 Sep 2024 11:16:50 -0400 Subject: [PATCH 201/352] drm/amdgpu: Fix selfring initialization sequence on soc24 Move enable_doorbell_selfring_aperture from common_hw_init to common_late_init in soc24, otherwise selfring aperture is initialized with an incorrect doorbell aperture base. Port changes from this commit from soc21 to soc24: commit 1c312e816c40 ("drm/amdgpu: Enable doorbell selfring after resize FB BAR") Signed-off-by: David Belanger Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- drivers/gpu/drm/amd/amdgpu/soc24.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c index b0c3678cfb31d..fd4c3d4f83879 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc24.c +++ b/drivers/gpu/drm/amd/amdgpu/soc24.c @@ -250,13 +250,6 @@ static void soc24_program_aspm(struct amdgpu_device *adev) adev->nbio.funcs->program_aspm(adev); } -static void soc24_enable_doorbell_aperture(struct amdgpu_device *adev, - bool enable) -{ - adev->nbio.funcs->enable_doorbell_aperture(adev, enable); - adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, enable); -} - const struct amdgpu_ip_block_version soc24_common_ip_block = { .type = AMD_IP_BLOCK_TYPE_COMMON, .major = 1, @@ -454,6 +447,11 @@ static int soc24_common_late_init(void *handle) if (amdgpu_sriov_vf(adev)) xgpu_nv_mailbox_get_irq(adev); + /* Enable selfring doorbell aperture late because doorbell BAR + * aperture will change if resize BAR successfully in gmc sw_init. 
+ */ + adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, true); + return 0; } @@ -491,7 +489,7 @@ static int soc24_common_hw_init(void *handle) adev->df.funcs->hw_init(adev); /* enable the doorbell aperture */ - soc24_enable_doorbell_aperture(adev, true); + adev->nbio.funcs->enable_doorbell_aperture(adev, true); return 0; } @@ -500,8 +498,13 @@ static int soc24_common_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - /* disable the doorbell aperture */ - soc24_enable_doorbell_aperture(adev, false); + /* Disable the doorbell aperture and selfring doorbell aperture + * separately in hw_fini because soc21_enable_doorbell_aperture + * has been removed and there is no need to delay disabling + * selfring doorbell. + */ + adev->nbio.funcs->enable_doorbell_aperture(adev, false); + adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false); if (amdgpu_sriov_vf(adev)) xgpu_nv_mailbox_put_irq(adev); From 797fb1533315571ff9e55e80154f48cd47f3dbe5 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Sep 2024 13:08:12 -0400 Subject: [PATCH 202/352] drm/amdgpu/gfx9.4.3: set additional bits on MEC halt Need to set the pipe reset and cache invalidation bits on halt otherwise we can get stale state if the CP firmware changes (e.g., on module unload and reload). Tested-by: Amber Lin Reviewed-by: Amber Lin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index b940d2ad57dba..611659404703d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -1701,7 +1701,15 @@ static void gfx_v9_4_3_xcc_cp_compute_enable(struct amdgpu_device *adev, WREG32_SOC15_RLC(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, 0); } else { WREG32_SOC15_RLC(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, - (CP_MEC_CNTL__MEC_ME1_HALT_MASK | CP_MEC_CNTL__MEC_ME2_HALT_MASK)); + (CP_MEC_CNTL__MEC_INVALIDATE_ICACHE_MASK | + CP_MEC_CNTL__MEC_ME1_PIPE0_RESET_MASK | + CP_MEC_CNTL__MEC_ME1_PIPE1_RESET_MASK | + CP_MEC_CNTL__MEC_ME1_PIPE2_RESET_MASK | + CP_MEC_CNTL__MEC_ME1_PIPE3_RESET_MASK | + CP_MEC_CNTL__MEC_ME2_PIPE0_RESET_MASK | + CP_MEC_CNTL__MEC_ME2_PIPE1_RESET_MASK | + CP_MEC_CNTL__MEC_ME1_HALT_MASK | + CP_MEC_CNTL__MEC_ME2_HALT_MASK)); adev->gfx.kiq[xcc_id].ring.sched.ready = false; } udelay(50); From c1de938fb7e5edc4c71d33f73e9fc5c77feb02a0 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Sep 2024 16:15:55 -0400 Subject: [PATCH 203/352] drm/amdgpu/gfx9.4.3: Explicitly halt MEC before init Need to make sure it's halted as we don't know what state the GPU may have been left in previously. 
Tested-by: Amber Lin Acked-by: Amber Lin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 611659404703d..c100845409f79 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -2248,6 +2248,8 @@ static int gfx_v9_4_3_xcc_cp_resume(struct amdgpu_device *adev, int xcc_id) r = gfx_v9_4_3_xcc_cp_compute_load_microcode(adev, xcc_id); if (r) return r; + } else { + gfx_v9_4_3_xcc_cp_compute_enable(adev, false, xcc_id); } r = gfx_v9_4_3_xcc_kiq_resume(adev, xcc_id); From 7181faaa4703705939580abffaf9cb5d6b50dbb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Tue, 27 Aug 2024 16:12:11 +0200 Subject: [PATCH 204/352] drm/amdgpu: nuke the VM PD/PT shadow handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was only used as workaround for recovering the page tables after VRAM was lost and is no longer necessary after the function amdgpu_vm_bo_reset_state_machine() started to do the same. Compute never used shadows either, so the only proplematic case left is SVM and that is most likely not recoverable in any way when VRAM is lost. Signed-off-by: Christian König Acked-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 87 +-------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 67 +--------------- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 21 ----- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 17 ---- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 56 +------------ drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 19 +---- 7 files changed, 6 insertions(+), 265 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index dcd59040c449d..9b1e0ede05a45 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1083,10 +1083,6 @@ struct amdgpu_device { struct amdgpu_virt virt; - /* link all shadow bo */ - struct list_head shadow_list; - struct mutex shadow_list_lock; - /* record hw reset is performed */ bool has_hw_reset; u8 reset_magic[AMDGPU_RESET_MAGIC_NUM]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index afb643e03d3c1..c2394c8b4d6b2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4107,9 +4107,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, spin_lock_init(&adev->mm_stats.lock); spin_lock_init(&adev->wb.lock); - INIT_LIST_HEAD(&adev->shadow_list); - mutex_init(&adev->shadow_list_lock); - INIT_LIST_HEAD(&adev->reset_list); INIT_LIST_HEAD(&adev->ras_list); @@ -5029,80 +5026,6 @@ static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) return 0; } -/** - * amdgpu_device_recover_vram - Recover some VRAM contents - * - * @adev: amdgpu_device pointer - * - * Restores the contents of VRAM buffers from the shadows in GTT. Used to - * restore things like GPUVM page tables after a GPU reset where - * the contents of VRAM might be lost. - * - * Returns: - * 0 on success, negative error code on failure. 
- */ -static int amdgpu_device_recover_vram(struct amdgpu_device *adev) -{ - struct dma_fence *fence = NULL, *next = NULL; - struct amdgpu_bo *shadow; - struct amdgpu_bo_vm *vmbo; - long r = 1, tmo; - - if (amdgpu_sriov_runtime(adev)) - tmo = msecs_to_jiffies(8000); - else - tmo = msecs_to_jiffies(100); - - dev_info(adev->dev, "recover vram bo from shadow start\n"); - mutex_lock(&adev->shadow_list_lock); - list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { - /* If vm is compute context or adev is APU, shadow will be NULL */ - if (!vmbo->shadow) - continue; - shadow = vmbo->shadow; - - /* No need to recover an evicted BO */ - if (!shadow->tbo.resource || - shadow->tbo.resource->mem_type != TTM_PL_TT || - shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || - shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) - continue; - - r = amdgpu_bo_restore_shadow(shadow, &next); - if (r) - break; - - if (fence) { - tmo = dma_fence_wait_timeout(fence, false, tmo); - dma_fence_put(fence); - fence = next; - if (tmo == 0) { - r = -ETIMEDOUT; - break; - } else if (tmo < 0) { - r = tmo; - break; - } - } else { - fence = next; - } - } - mutex_unlock(&adev->shadow_list_lock); - - if (fence) - tmo = dma_fence_wait_timeout(fence, false, tmo); - dma_fence_put(fence); - - if (r < 0 || tmo <= 0) { - dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); - return -EIO; - } - - dev_info(adev->dev, "recover vram bo from shadow done\n"); - return 0; -} - - /** * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf * @@ -5165,12 +5088,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { + if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) amdgpu_inc_vram_lost(adev); - r = amdgpu_device_recover_vram(adev); - } - if (r) - return r; /* need to be called during full access so we can't do it later like * bare-metal does. 
@@ -5569,9 +5488,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, } } - if (!r) - r = amdgpu_device_recover_vram(tmp_adev); - else + if (r) tmp_adev->asic_reset_res = r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index e32161f6b67a3..a987f671b1d53 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -77,24 +77,6 @@ static void amdgpu_bo_user_destroy(struct ttm_buffer_object *tbo) amdgpu_bo_destroy(tbo); } -static void amdgpu_bo_vm_destroy(struct ttm_buffer_object *tbo) -{ - struct amdgpu_device *adev = amdgpu_ttm_adev(tbo->bdev); - struct amdgpu_bo *shadow_bo = ttm_to_amdgpu_bo(tbo), *bo; - struct amdgpu_bo_vm *vmbo; - - bo = shadow_bo->parent; - vmbo = to_amdgpu_bo_vm(bo); - /* in case amdgpu_device_recover_vram got NULL of bo->parent */ - if (!list_empty(&vmbo->shadow_list)) { - mutex_lock(&adev->shadow_list_lock); - list_del_init(&vmbo->shadow_list); - mutex_unlock(&adev->shadow_list_lock); - } - - amdgpu_bo_destroy(tbo); -} - /** * amdgpu_bo_is_amdgpu_bo - check if the buffer object is an &amdgpu_bo * @bo: buffer object to be checked @@ -108,8 +90,7 @@ static void amdgpu_bo_vm_destroy(struct ttm_buffer_object *tbo) bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo) { if (bo->destroy == &amdgpu_bo_destroy || - bo->destroy == &amdgpu_bo_user_destroy || - bo->destroy == &amdgpu_bo_vm_destroy) + bo->destroy == &amdgpu_bo_user_destroy) return true; return false; @@ -722,52 +703,6 @@ int amdgpu_bo_create_vm(struct amdgpu_device *adev, return r; } -/** - * amdgpu_bo_add_to_shadow_list - add a BO to the shadow list - * - * @vmbo: BO that will be inserted into the shadow list - * - * Insert a BO to the shadow list. - */ -void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo) -{ - struct amdgpu_device *adev = amdgpu_ttm_adev(vmbo->bo.tbo.bdev); - - mutex_lock(&adev->shadow_list_lock); - list_add_tail(&vmbo->shadow_list, &adev->shadow_list); - vmbo->shadow->parent = amdgpu_bo_ref(&vmbo->bo); - vmbo->shadow->tbo.destroy = &amdgpu_bo_vm_destroy; - mutex_unlock(&adev->shadow_list_lock); -} - -/** - * amdgpu_bo_restore_shadow - restore an &amdgpu_bo shadow - * - * @shadow: &amdgpu_bo shadow to be restored - * @fence: dma_fence associated with the operation - * - * Copies a buffer object's shadow content back to the object. - * This is used for recovering a buffer from its shadow in case of a gpu - * reset where vram context may be lost. - * - * Returns: - * 0 for success or a negative error code on failure. 
- */ -int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct dma_fence **fence) - -{ - struct amdgpu_device *adev = amdgpu_ttm_adev(shadow->tbo.bdev); - struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; - uint64_t shadow_addr, parent_addr; - - shadow_addr = amdgpu_bo_gpu_offset(shadow); - parent_addr = amdgpu_bo_gpu_offset(shadow->parent); - - return amdgpu_copy_buffer(ring, shadow_addr, parent_addr, - amdgpu_bo_size(shadow), NULL, fence, - true, false, 0); -} - /** * amdgpu_bo_kmap - map an &amdgpu_bo buffer object * @bo: &amdgpu_bo buffer object to be mapped diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index d7e27957013f3..e42c34a3289d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -136,8 +136,6 @@ struct amdgpu_bo_user { struct amdgpu_bo_vm { struct amdgpu_bo bo; - struct amdgpu_bo *shadow; - struct list_head shadow_list; struct amdgpu_vm_bo_base entries[]; }; @@ -275,22 +273,6 @@ static inline bool amdgpu_bo_encrypted(struct amdgpu_bo *bo) return bo->flags & AMDGPU_GEM_CREATE_ENCRYPTED; } -/** - * amdgpu_bo_shadowed - check if the BO is shadowed - * - * @bo: BO to be tested. - * - * Returns: - * NULL if not shadowed or else return a BO pointer. - */ -static inline struct amdgpu_bo *amdgpu_bo_shadowed(struct amdgpu_bo *bo) -{ - if (bo->tbo.type == ttm_bo_type_kernel) - return to_amdgpu_bo_vm(bo)->shadow; - - return NULL; -} - bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo); void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain); @@ -349,9 +331,6 @@ u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo); u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo); void amdgpu_bo_get_memory(struct amdgpu_bo *bo, struct amdgpu_mem_stats *stats); -void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo); -int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, - struct dma_fence **fence); uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev, uint32_t domain); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 2452dfa6314fd..985b89242b44f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -465,7 +465,6 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, { uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm); struct amdgpu_vm_bo_base *bo_base; - struct amdgpu_bo *shadow; struct amdgpu_bo *bo; int r; @@ -486,16 +485,10 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, spin_unlock(&vm->status_lock); bo = bo_base->bo; - shadow = amdgpu_bo_shadowed(bo); r = validate(param, bo); if (r) return r; - if (shadow) { - r = validate(param, shadow); - if (r) - return r; - } if (bo->tbo.type != ttm_bo_type_kernel) { amdgpu_vm_bo_moved(bo_base); @@ -2149,10 +2142,6 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev, { struct amdgpu_vm_bo_base *bo_base; - /* shadow bo doesn't have bo base, its validation needs its parent */ - if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo)) - bo = bo->parent; - for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { struct amdgpu_vm *vm = bo_base->vm; @@ -2482,7 +2471,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, root_bo = amdgpu_bo_ref(&root->bo); r = amdgpu_bo_reserve(root_bo, true); if (r) { - amdgpu_bo_unref(&root->shadow); amdgpu_bo_unref(&root_bo); goto error_free_delayed; } @@ -2575,11 +2563,6 @@ int 
amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm) vm->last_update = dma_fence_get_stub(); vm->is_compute_context = true; - /* Free the shadow bo for compute VM */ - amdgpu_bo_unref(&to_amdgpu_bo_vm(vm->root.bo)->shadow); - - goto unreserve_bo; - unreserve_bo: amdgpu_bo_unreserve(vm->root.bo); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index a076f43097e48..f78a0434a48fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -383,14 +383,6 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm *vm, if (r) return r; - if (vmbo->shadow) { - struct amdgpu_bo *shadow = vmbo->shadow; - - r = ttm_bo_validate(&shadow->tbo, &shadow->placement, &ctx); - if (r) - return r; - } - if (!drm_dev_enter(adev_to_drm(adev), &idx)) return -ENODEV; @@ -448,10 +440,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t xcp_id) { struct amdgpu_bo_param bp; - struct amdgpu_bo *bo; - struct dma_resv *resv; unsigned int num_entries; - int r; memset(&bp, 0, sizeof(bp)); @@ -484,42 +473,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, if (vm->root.bo) bp.resv = vm->root.bo->tbo.base.resv; - r = amdgpu_bo_create_vm(adev, &bp, vmbo); - if (r) - return r; - - bo = &(*vmbo)->bo; - if (vm->is_compute_context || (adev->flags & AMD_IS_APU)) { - (*vmbo)->shadow = NULL; - return 0; - } - - if (!bp.resv) - WARN_ON(dma_resv_lock(bo->tbo.base.resv, - NULL)); - resv = bp.resv; - memset(&bp, 0, sizeof(bp)); - bp.size = amdgpu_vm_pt_size(adev, level); - bp.domain = AMDGPU_GEM_DOMAIN_GTT; - bp.flags = AMDGPU_GEM_CREATE_CPU_GTT_USWC; - bp.type = ttm_bo_type_kernel; - bp.resv = bo->tbo.base.resv; - bp.bo_ptr_size = sizeof(struct amdgpu_bo); - bp.xcp_id_plus1 = xcp_id + 1; - - r = amdgpu_bo_create(adev, &bp, &(*vmbo)->shadow); - - if (!resv) - dma_resv_unlock(bo->tbo.base.resv); - - if (r) { - amdgpu_bo_unref(&bo); - return r; - } - - amdgpu_bo_add_to_shadow_list(*vmbo); - - return 0; + return amdgpu_bo_create_vm(adev, &bp, vmbo); } /** @@ -569,7 +523,6 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev, return 0; error_free_pt: - amdgpu_bo_unref(&pt->shadow); amdgpu_bo_unref(&pt_bo); return r; } @@ -581,17 +534,10 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev, */ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) { - struct amdgpu_bo *shadow; - if (!entry->bo) return; entry->bo->vm_bo = NULL; - shadow = amdgpu_bo_shadowed(entry->bo); - if (shadow) { - ttm_bo_set_bulk_move(&shadow->tbo, NULL); - amdgpu_bo_unref(&shadow); - } ttm_bo_set_bulk_move(&entry->bo->tbo, NULL); spin_lock(&entry->vm->status_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c index 4772fba33285b..46d9fb433ab2a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c @@ -35,16 +35,7 @@ */ static int amdgpu_vm_sdma_map_table(struct amdgpu_bo_vm *table) { - int r; - - r = amdgpu_ttm_alloc_gart(&table->bo.tbo); - if (r) - return r; - - if (table->shadow) - r = amdgpu_ttm_alloc_gart(&table->shadow->tbo); - - return r; + return amdgpu_ttm_alloc_gart(&table->bo.tbo); } /* Allocate a new job for @count PTE updates */ @@ -265,17 +256,13 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p, if (!p->pages_addr) { /* set page commands needed */ - if (vmbo->shadow) - amdgpu_vm_sdma_set_ptes(p, 
vmbo->shadow, pe, addr, - count, incr, flags); amdgpu_vm_sdma_set_ptes(p, bo, pe, addr, count, incr, flags); return 0; } /* copy commands needed */ - ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw * - (vmbo->shadow ? 2 : 1); + ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw; /* for padding */ ndw -= 7; @@ -290,8 +277,6 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p, pte[i] |= flags; } - if (vmbo->shadow) - amdgpu_vm_sdma_copy_ptes(p, vmbo->shadow, pe, nptes); amdgpu_vm_sdma_copy_ptes(p, bo, pe, nptes); pe += nptes * 8; From 151b1813919d4ab932e69ca4032761ee0789b04c Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Thu, 18 Jul 2024 18:09:17 +0800 Subject: [PATCH 205/352] drm/amd/pm: Update SMUv13.0.6 PMFW headers Update PMFW interface headers for updated metrics table with gfx activity per xcd Signed-off-by: Asad Kamal Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h index 0b3c2f54a3433..822c6425d90e0 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h @@ -123,7 +123,7 @@ typedef enum { VOLTAGE_GUARDBAND_COUNT } GFX_GUARDBAND_e; -#define SMU_METRICS_TABLE_VERSION 0xC +#define SMU_METRICS_TABLE_VERSION 0xD typedef struct __attribute__((packed, aligned(4))) { uint32_t AccumulationCounter; @@ -227,6 +227,10 @@ typedef struct __attribute__((packed, aligned(4))) { // PCIE LINK Speed and width uint32_t PCIeLinkSpeed; uint32_t PCIeLinkWidth; + + // PER XCD ACTIVITY + uint32_t GfxBusy[8]; + uint64_t GfxBusyAcc[8]; } MetricsTableX_t; typedef struct __attribute__((packed, aligned(4))) { From 2ae6cd583c4c86c2b7e879b07effb8ffb10756bc Mon Sep 17 00:00:00 2001 From: Le Ma Date: Tue, 10 Sep 2024 17:53:42 +0800 Subject: [PATCH 206/352] drm/amdgpu: add psp funcs callback to check if aux fw is needed Query pmfw version to determine if aux sos fw needs to be loaded in psp v13.0. v2: refine callback to check if aux_fw loading is needed instead of getting pmfw version barely v3: return the comparison directly Signed-off-by: Le Ma Reviewed-by: Lijo Lazar Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 4 ++++ drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 74a96516c9138..e8abbbcb43266 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -138,6 +138,7 @@ struct psp_funcs { int (*vbflash_stat)(struct psp_context *psp); int (*fatal_error_recovery_quirk)(struct psp_context *psp); bool (*get_ras_capability)(struct psp_context *psp); + bool (*is_aux_sos_load_required)(struct psp_context *psp); }; struct ta_funcs { @@ -464,6 +465,9 @@ struct amdgpu_psp_funcs { ((psp)->funcs->fatal_error_recovery_quirk ? \ (psp)->funcs->fatal_error_recovery_quirk((psp)) : 0) +#define psp_is_aux_sos_load_required(psp) \ + ((psp)->funcs->is_aux_sos_load_required ? 
(psp)->funcs->is_aux_sos_load_required((psp)) : 0) + extern const struct amd_ip_funcs psp_ip_funcs; extern const struct amdgpu_ip_block_version psp_v3_1_ip_block; diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index 1251ee38a6764..51e470e8d67d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -81,6 +81,8 @@ MODULE_FIRMWARE("amdgpu/psp_14_0_4_ta.bin"); /* memory training timeout define */ #define MEM_TRAIN_SEND_MSG_TIMEOUT_US 3000000 +#define regMP1_PUB_SCRATCH0 0x3b10090 + static int psp_v13_0_init_microcode(struct psp_context *psp) { struct amdgpu_device *adev = psp->adev; @@ -807,6 +809,20 @@ static bool psp_v13_0_get_ras_capability(struct psp_context *psp) } } +static bool psp_v13_0_is_aux_sos_load_required(struct psp_context *psp) +{ + struct amdgpu_device *adev = psp->adev; + u32 pmfw_ver; + + if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(13, 0, 6)) + return false; + + /* load 4e version of sos if pmfw version less than 85.115.0 */ + pmfw_ver = RREG32(regMP1_PUB_SCRATCH0 / 4); + + return (pmfw_ver < 0x557300); +} + static const struct psp_funcs psp_v13_0_funcs = { .init_microcode = psp_v13_0_init_microcode, .wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state, @@ -830,6 +846,7 @@ static const struct psp_funcs psp_v13_0_funcs = { .vbflash_stat = psp_v13_0_vbflash_status, .fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk, .get_ras_capability = psp_v13_0_get_ras_capability, + .is_aux_sos_load_required = psp_v13_0_is_aux_sos_load_required, }; void psp_v13_0_set_psp_funcs(struct psp_context *psp) From 2778701b165eda674756537054d460fb4b0cf2e2 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Tue, 10 Sep 2024 20:10:45 +0800 Subject: [PATCH 207/352] drm/amdgpu: load sos binary properly on the basis of pmfw version To be compatible with legacy IFWI, driver needs to carry legacy tOS and query pmfw version to load them accordingly. Add psp_firmware_header_v2_1 to handle the combined sos binary. Double the sos count limit for the case of aux sos fw packed. 
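As background only (not part of the change), a minimal sketch of the descriptor-range selection the hunk below implements; the free-standing helper and its signature are hypothetical stand-ins for the inline logic in psp_init_sos_microcode():

    /*
     * When the auxiliary SOS is required (older PMFW), parsing starts at the
     * aux descriptor index; otherwise the bin count is trimmed so the aux
     * descriptors packed at the tail are skipped.  The caller then walks
     * the descriptors with for (i = start_index; i < bin_count; i++).
     */
    static void sos_v2_1_select_range(unsigned int total_bin_count,
                                      unsigned int aux_bin_index,
                                      bool aux_required,
                                      unsigned int *start_index,
                                      unsigned int *bin_count)
    {
            *start_index = 0;
            *bin_count = total_bin_count;

            if (aux_required)
                    *start_index = aux_bin_index;
            else
                    *bin_count -= aux_bin_index;
    }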
v2: pass the correct fw_bin_desc to parse_sos_bin_descriptor Signed-off-by: Le Ma Reviewed-by: Lijo Lazar Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 29 ++++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h | 11 ++++++++- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index e9e599ff3bd48..0b28b2cf1517d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -3425,9 +3425,11 @@ int psp_init_sos_microcode(struct psp_context *psp, const char *chip_name) const struct psp_firmware_header_v1_2 *sos_hdr_v1_2; const struct psp_firmware_header_v1_3 *sos_hdr_v1_3; const struct psp_firmware_header_v2_0 *sos_hdr_v2_0; - int err = 0; + const struct psp_firmware_header_v2_1 *sos_hdr_v2_1; + int fw_index, fw_bin_count, start_index = 0; + const struct psp_fw_bin_desc *fw_bin; uint8_t *ucode_array_start_addr; - int fw_index = 0; + int err = 0; err = amdgpu_ucode_request(adev, &adev->psp.sos_fw, "amdgpu/%s_sos.bin", chip_name); if (err) @@ -3478,15 +3480,30 @@ int psp_init_sos_microcode(struct psp_context *psp, const char *chip_name) case 2: sos_hdr_v2_0 = (const struct psp_firmware_header_v2_0 *)adev->psp.sos_fw->data; - if (le32_to_cpu(sos_hdr_v2_0->psp_fw_bin_count) >= UCODE_MAX_PSP_PACKAGING) { + fw_bin_count = le32_to_cpu(sos_hdr_v2_0->psp_fw_bin_count); + + if (fw_bin_count >= UCODE_MAX_PSP_PACKAGING) { dev_err(adev->dev, "packed SOS count exceeds maximum limit\n"); err = -EINVAL; goto out; } - for (fw_index = 0; fw_index < le32_to_cpu(sos_hdr_v2_0->psp_fw_bin_count); fw_index++) { - err = parse_sos_bin_descriptor(psp, - &sos_hdr_v2_0->psp_fw_bin[fw_index], + if (sos_hdr_v2_0->header.header_version_minor == 1) { + sos_hdr_v2_1 = (const struct psp_firmware_header_v2_1 *)adev->psp.sos_fw->data; + + fw_bin = sos_hdr_v2_1->psp_fw_bin; + + if (psp_is_aux_sos_load_required(psp)) + start_index = le32_to_cpu(sos_hdr_v2_1->psp_aux_fw_bin_index); + else + fw_bin_count -= le32_to_cpu(sos_hdr_v2_1->psp_aux_fw_bin_index); + + } else { + fw_bin = sos_hdr_v2_0->psp_fw_bin; + } + + for (fw_index = start_index; fw_index < fw_bin_count; fw_index++) { + err = parse_sos_bin_descriptor(psp, fw_bin + fw_index, sos_hdr_v2_0); if (err) goto out; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h index 5bc37acd39819..4e23419b92d4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h @@ -136,6 +136,14 @@ struct psp_firmware_header_v2_0 { struct psp_fw_bin_desc psp_fw_bin[]; }; +/* version_major=2, version_minor=1 */ +struct psp_firmware_header_v2_1 { + struct common_firmware_header header; + uint32_t psp_fw_bin_count; + uint32_t psp_aux_fw_bin_index; + struct psp_fw_bin_desc psp_fw_bin[]; +}; + /* version_major=1, version_minor=0 */ struct ta_firmware_header_v1_0 { struct common_firmware_header header; @@ -426,6 +434,7 @@ union amdgpu_firmware_header { struct psp_firmware_header_v1_1 psp_v1_1; struct psp_firmware_header_v1_3 psp_v1_3; struct psp_firmware_header_v2_0 psp_v2_0; + struct psp_firmware_header_v2_0 psp_v2_1; struct ta_firmware_header_v1_0 ta; struct ta_firmware_header_v2_0 ta_v2_0; struct gfx_firmware_header_v1_0 gfx; @@ -447,7 +456,7 @@ union amdgpu_firmware_header { uint8_t raw[0x100]; }; -#define UCODE_MAX_PSP_PACKAGING ((sizeof(union amdgpu_firmware_header) - sizeof(struct common_firmware_header) - 4) / 
sizeof(struct psp_fw_bin_desc)) +#define UCODE_MAX_PSP_PACKAGING (((sizeof(union amdgpu_firmware_header) - sizeof(struct common_firmware_header) - 4) / sizeof(struct psp_fw_bin_desc)) * 2) /* * fw loading support From c03fca619fc687338a3b6511fdbed94096abdf79 Mon Sep 17 00:00:00 2001 From: Robin Chen Date: Fri, 23 Aug 2024 15:00:28 +0800 Subject: [PATCH 208/352] drm/amd/display: Round calculated vtotal [WHY] The calculated vtotal may has 1 line deviation. To get precisely vtotal number, round the vtotal result. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Anthony Koo Signed-off-by: Robin Chen Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/modules/freesync/freesync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c index a40e6590215a6..bbd259cea4f4f 100644 --- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c +++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c @@ -134,7 +134,7 @@ unsigned int mod_freesync_calc_v_total_from_refresh( v_total = div64_u64(div64_u64(((unsigned long long)( frame_duration_in_ns) * (stream->timing.pix_clk_100hz / 10)), - stream->timing.h_total), 1000000); + stream->timing.h_total) + 500000, 1000000); /* v_total cannot be less than nominal */ if (v_total < stream->timing.v_total) { From c2ed7002c0614c5eab6c8f62a7a76be5df5805cf Mon Sep 17 00:00:00 2001 From: Samson Tam Date: Fri, 23 Aug 2024 16:57:33 -0400 Subject: [PATCH 209/352] drm/amd/display: Use SDR white level to calculate matrix coefficients [WHY] Certain profiles have higher HDR multiplier than SDR white level max which is not currently supported. [HOW] Use SDR white level when calculating matrix coefficients for HDR RGB MPO path instead of HDR multiplier. 
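For clarity (illustration only, not part of the change), a minimal sketch of the nits-to-multiplier conversion the SPL hunk below performs; the stand-alone helper is a hypothetical wrapper around the inline math in spl_calculate_c0_c3_hdr():

    /*
     * The SDR white level in nits is converted to an x100 multiplier relative
     * to the 80-nit reference; values outside the supported 80..480 nit range
     * fall back to 1.0 (100), i.e. the 80-nit default.
     */
    static unsigned int sdr_white_level_to_multx100(unsigned int sdr_white_level_nits)
    {
            if (sdr_white_level_nits >= 80 && sdr_white_level_nits <= 480)
                    return sdr_white_level_nits * 100 / 80;

            return 100; /* default for 80 nits */
    }

For example, a 240-nit SDR white level yields 300, i.e. a 3.0 multiplier for the C0-C3 coefficient calculation.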
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Jun Lei Signed-off-by: Samson Tam Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 12 ++++++++++++ drivers/gpu/drm/amd/display/dc/dc.h | 3 +++ .../gpu/drm/amd/display/dc/dc_spl_translate.c | 9 +-------- drivers/gpu/drm/amd/display/dc/spl/dc_spl.c | 17 +++++++++++------ .../gpu/drm/amd/display/dc/spl/dc_spl_types.h | 2 +- 5 files changed, 28 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index ae788154896c4..243928b0a39f6 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -2596,6 +2596,12 @@ static enum surface_update_type det_surface_update(const struct dc *dc, elevate_update_type(&overall_type, UPDATE_TYPE_MED); } + if (u->sdr_white_level_nits) + if (u->sdr_white_level_nits != u->surface->sdr_white_level_nits) { + update_flags->bits.sdr_white_level_nits = 1; + elevate_update_type(&overall_type, UPDATE_TYPE_FULL); + } + if (u->cm2_params) { if ((u->cm2_params->component_settings.shaper_3dlut_setting != u->surface->mcm_shaper_3dlut_setting) @@ -2876,6 +2882,10 @@ static void copy_surface_update_to_plane( surface->hdr_mult = srf_update->hdr_mult; + if (srf_update->sdr_white_level_nits) + surface->sdr_white_level_nits = + srf_update->sdr_white_level_nits; + if (srf_update->blend_tf) memcpy(&surface->blend_tf, srf_update->blend_tf, sizeof(surface->blend_tf)); @@ -4679,6 +4689,8 @@ static bool full_update_required(struct dc *dc, srf_updates[i].scaling_info || (srf_updates[i].hdr_mult.value && srf_updates[i].hdr_mult.value != srf_updates->surface->hdr_mult.value) || + (srf_updates[i].sdr_white_level_nits && + srf_updates[i].sdr_white_level_nits != srf_updates->surface->sdr_white_level_nits) || srf_updates[i].in_transfer_func || srf_updates[i].func_shaper || srf_updates[i].lut3d_func || diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 4c94dd38be4bc..dcf8a90e961d3 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1269,6 +1269,7 @@ union surface_update_flags { uint32_t tmz_changed:1; uint32_t mcm_transfer_function_enable_change:1; /* disable or enable MCM transfer func */ uint32_t full_update:1; + uint32_t sdr_white_level_nits:1; } bits; uint32_t raw; @@ -1351,6 +1352,7 @@ struct dc_plane_state { bool adaptive_sharpness_en; int sharpness_level; enum linear_light_scaling linear_light_scaling; + unsigned int sdr_white_level_nits; }; struct dc_plane_info { @@ -1508,6 +1510,7 @@ struct dc_surface_update { */ struct dc_cm2_parameters *cm2_params; const struct dc_csc_transform *cursor_csc_color_matrix; + unsigned int sdr_white_level_nits; }; /* diff --git a/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c b/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c index cd6de93eb91c3..f711fc2e3e651 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c +++ b/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c @@ -191,14 +191,7 @@ void translate_SPL_in_params_from_pipe_ctx(struct pipe_ctx *pipe_ctx, struct spl */ spl_in->is_fullscreen = dm_helpers_is_fullscreen(pipe_ctx->stream->ctx, pipe_ctx->stream); spl_in->is_hdr_on = dm_helpers_is_hdr_on(pipe_ctx->stream->ctx, pipe_ctx->stream); - spl_in->hdr_multx100 = 0; - if (spl_in->is_hdr_on) { - spl_in->hdr_multx100 = (uint32_t)dc_fixpt_floor(dc_fixpt_mul(plane_state->hdr_mult, - 
dc_fixpt_from_int(100))); - /* Disable sharpness for HDR Mult > 6.0 */ - if (spl_in->hdr_multx100 > 600) - spl_in->adaptive_sharpness.enable = false; - } + spl_in->sdr_white_level_nits = plane_state->sdr_white_level_nits; } /// @brief Translate SPL output parameters to pipe context diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c b/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c index 15f7eda903e64..a59aa6b596875 100644 --- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c +++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c @@ -1155,14 +1155,19 @@ static void spl_set_dscl_prog_data(struct spl_in *spl_in, struct spl_scratch *sp } /* Calculate C0-C3 coefficients based on HDR_mult */ -static void spl_calculate_c0_c3_hdr(struct dscl_prog_data *dscl_prog_data, uint32_t hdr_multx100) +static void spl_calculate_c0_c3_hdr(struct dscl_prog_data *dscl_prog_data, uint32_t sdr_white_level_nits) { struct spl_fixed31_32 hdr_mult, c0_mult, c1_mult, c2_mult; struct spl_fixed31_32 c0_calc, c1_calc, c2_calc; struct spl_custom_float_format fmt; + uint32_t hdr_multx100_int; - SPL_ASSERT(hdr_multx100); - hdr_mult = spl_fixpt_from_fraction((long long)hdr_multx100, 100LL); + if ((sdr_white_level_nits >= 80) && (sdr_white_level_nits <= 480)) + hdr_multx100_int = sdr_white_level_nits * 100 / 80; + else + hdr_multx100_int = 100; /* default for 80 nits otherwise */ + + hdr_mult = spl_fixpt_from_fraction((long long)hdr_multx100_int, 100LL); c0_mult = spl_fixpt_from_fraction(2126LL, 10000LL); c1_mult = spl_fixpt_from_fraction(7152LL, 10000LL); c2_mult = spl_fixpt_from_fraction(722LL, 10000LL); @@ -1191,7 +1196,7 @@ static void spl_calculate_c0_c3_hdr(struct dscl_prog_data *dscl_prog_data, uint3 static void spl_set_easf_data(struct spl_scratch *spl_scratch, struct spl_out *spl_out, bool enable_easf_v, bool enable_easf_h, enum linear_light_scaling lls_pref, enum spl_pixel_format format, enum system_setup setup, - uint32_t hdr_multx100) + uint32_t sdr_white_level_nits) { struct dscl_prog_data *dscl_prog_data = spl_out->dscl_prog_data; if (enable_easf_v) { @@ -1499,7 +1504,7 @@ static void spl_set_easf_data(struct spl_scratch *spl_scratch, struct spl_out *s dscl_prog_data->easf_ltonl_en = 1; // Linear input if ((setup == HDR_L) && (spl_is_rgb8(format))) { /* Calculate C0-C3 coefficients based on HDR multiplier */ - spl_calculate_c0_c3_hdr(dscl_prog_data, hdr_multx100); + spl_calculate_c0_c3_hdr(dscl_prog_data, sdr_white_level_nits); } else { // HDR_L ( DWM ) and SDR_L dscl_prog_data->easf_matrix_c0 = 0x4EF7; // fp1.5.10, C0 coefficient (LN_rec709: 0.2126 * (2^14)/125 = 27.86590720) @@ -1750,7 +1755,7 @@ bool spl_calculate_scaler_params(struct spl_in *spl_in, struct spl_out *spl_out) // Set EASF spl_set_easf_data(&spl_scratch, spl_out, enable_easf_v, enable_easf_h, spl_in->lls_pref, - spl_in->basic_in.format, setup, spl_in->hdr_multx100); + spl_in->basic_in.format, setup, spl_in->sdr_white_level_nits); // Set iSHARP vratio = spl_fixpt_ceil(spl_scratch.scl_data.ratios.vert); diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h index 85b19ebe2c576..74f2a8c42f4f8 100644 --- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h +++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h @@ -518,7 +518,7 @@ struct spl_in { bool is_hdr_on; int h_active; int v_active; - int hdr_multx100; + int sdr_white_level_nits; }; // end of SPL inputs From f588da30a20cf184f150420e4098b694908a4207 Mon Sep 17 00:00:00 2001 From: Ryan Seto Date: Mon, 19 Aug 2024 17:06:56 -0400 Subject: [PATCH 
210/352] drm/amd/display: Implement new DPCD register handling [WHY] There are some monitor timings that seem to be supported without DSC but actually require DSC to be displayed. A VESA SCR introduced a new max uncompressed pixel rate cap register that we can use to handle these edge cases. [HOW] SST: Read caps from link and invalidate timings that exceed the max limit but do not support DSC. Then check for options override when determining BPP. MST: Read caps from virtual DPCD peer device or daisy chained SST monitor and set validation set BPPs to max if pixel rate exceeds uncompressed limit. Validation set optimization continues as normal. Reviewed-by: Wenjing Liu Signed-off-by: Ryan Seto Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc_dp_types.h | 12 ++++++++++++ drivers/gpu/drm/amd/display/dc/dc_dsc.h | 1 + drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c | 10 +++++----- .../gpu/drm/amd/display/dc/link/link_validation.c | 7 +++++++ .../display/dc/link/protocols/link_dp_capability.c | 5 +++++ 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc_dp_types.h b/drivers/gpu/drm/amd/display/dc/dc_dp_types.h index 519c3df78ee5b..41bd95e9177a4 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dp_types.h +++ b/drivers/gpu/drm/amd/display/dc/dc_dp_types.h @@ -969,6 +969,14 @@ union dp_sink_video_fallback_formats { uint8_t raw; }; +union dpcd_max_uncompressed_pixel_rate_cap { + struct { + uint16_t max_uncompressed_pixel_rate_cap :15; + uint16_t valid :1; + } bits; + uint8_t raw[2]; +}; + union dp_fec_capability1 { struct { uint8_t AGGREGATED_ERROR_COUNTERS_CAPABLE :1; @@ -1170,6 +1178,7 @@ struct dpcd_caps { struct dc_lttpr_caps lttpr_caps; struct adaptive_sync_caps adaptive_sync_caps; struct dpcd_usb4_dp_tunneling_info usb4_dp_tun_info; + union dpcd_max_uncompressed_pixel_rate_cap max_uncompressed_pixel_rate_cap; union dp_128b_132b_supported_link_rates dp_128b_132b_supported_link_rates; union dp_main_line_channel_coding_cap channel_coding_cap; @@ -1340,6 +1349,9 @@ struct dp_trace { #ifndef DP_CABLE_ATTRIBUTES_UPDATED_BY_DPTX #define DP_CABLE_ATTRIBUTES_UPDATED_BY_DPTX 0x110 #endif +#ifndef DPCD_MAX_UNCOMPRESSED_PIXEL_RATE_CAP +#define DPCD_MAX_UNCOMPRESSED_PIXEL_RATE_CAP 0x221c +#endif #ifndef DP_REPEATER_CONFIGURATION_AND_STATUS_SIZE #define DP_REPEATER_CONFIGURATION_AND_STATUS_SIZE 0x50 #endif diff --git a/drivers/gpu/drm/amd/display/dc/dc_dsc.h b/drivers/gpu/drm/amd/display/dc/dc_dsc.h index fe3078b8789ef..2a5120ecf48be 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dsc.h +++ b/drivers/gpu/drm/amd/display/dc/dc_dsc.h @@ -59,6 +59,7 @@ struct dc_dsc_config_options { uint32_t max_target_bpp_limit_override_x16; uint32_t slice_height_granularity; uint32_t dsc_force_odm_hslice_override; + bool force_dsc_when_not_needed; }; bool dc_dsc_parse_dsc_dpcd(const struct dc *dc, diff --git a/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c b/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c index a1727e5bf0247..79c426425911f 100644 --- a/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c +++ b/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c @@ -668,6 +668,7 @@ static bool decide_dsc_bandwidth_range( */ static bool decide_dsc_target_bpp_x16( const struct dc_dsc_policy *policy, + const struct dc_dsc_config_options *options, const struct dsc_enc_caps *dsc_common_caps, const int target_bandwidth_kbps, const struct dc_crtc_timing *timing, @@ -682,7 +683,7 @@ static bool decide_dsc_target_bpp_x16( if 
(decide_dsc_bandwidth_range(policy->min_target_bpp * 16, policy->max_target_bpp * 16, num_slices_h, dsc_common_caps, timing, link_encoding, &range)) { if (target_bandwidth_kbps >= range.stream_kbps) { - if (policy->enable_dsc_when_not_needed) + if (policy->enable_dsc_when_not_needed || options->force_dsc_when_not_needed) /* enable max bpp even dsc is not needed */ *target_bpp_x16 = range.max_target_bpp_x16; } else if (target_bandwidth_kbps >= range.max_kbps) { @@ -1080,6 +1081,7 @@ static bool setup_dsc_config( if (target_bandwidth_kbps > 0) { is_dsc_possible = decide_dsc_target_bpp_x16( &policy, + options, &dsc_common_caps, target_bandwidth_kbps, timing, @@ -1235,10 +1237,7 @@ void dc_dsc_get_policy_for_timing(const struct dc_crtc_timing *timing, policy->max_target_bpp = max_target_bpp_limit_override_x16 / 16; /* enable DSC when not needed, default false */ - if (dsc_policy_enable_dsc_when_not_needed) - policy->enable_dsc_when_not_needed = dsc_policy_enable_dsc_when_not_needed; - else - policy->enable_dsc_when_not_needed = false; + policy->enable_dsc_when_not_needed = dsc_policy_enable_dsc_when_not_needed; } void dc_dsc_policy_set_max_target_bpp_limit(uint32_t limit) @@ -1267,4 +1266,5 @@ void dc_dsc_get_default_config_option(const struct dc *dc, struct dc_dsc_config_ options->dsc_force_odm_hslice_override = dc->debug.force_odm_combine; options->max_target_bpp_limit_override_x16 = 0; options->slice_height_granularity = 1; + options->force_dsc_when_not_needed = false; } diff --git a/drivers/gpu/drm/amd/display/dc/link/link_validation.c b/drivers/gpu/drm/amd/display/dc/link/link_validation.c index 1aed55b0ab6a0..60f15a9ba7a5e 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_validation.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_validation.c @@ -287,6 +287,13 @@ static bool dp_validate_mode_timing( req_bw = dc_bandwidth_in_kbps_from_timing(timing, dc_link_get_highest_encoding_format(link)); max_bw = dp_link_bandwidth_kbps(link, link_setting); + bool is_max_uncompressed_pixel_rate_exceeded = link->dpcd_caps.max_uncompressed_pixel_rate_cap.bits.valid && + timing->pix_clk_100hz > link->dpcd_caps.max_uncompressed_pixel_rate_cap.bits.max_uncompressed_pixel_rate_cap * 10000; + + if (is_max_uncompressed_pixel_rate_exceeded && !timing->flags.DSC) { + return false; + } + if (req_bw <= max_bw) { /* remember the biggest mode here, during * initial link training (to get diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c index 34a618a7278b0..d78c8ec4de79e 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c @@ -1942,6 +1942,11 @@ static bool retrieve_link_cap(struct dc_link *link) DC_LOG_DP2("\tFEC aggregated error counters are supported"); } + core_link_read_dpcd(link, + DPCD_MAX_UNCOMPRESSED_PIXEL_RATE_CAP, + link->dpcd_caps.max_uncompressed_pixel_rate_cap.raw, + sizeof(link->dpcd_caps.max_uncompressed_pixel_rate_cap.raw)); + retrieve_cable_id(link); dpcd_write_cable_id_to_dprx(link); From d18a56b247f4f3f7dbdd3adeeebd05c23f1e3d3e Mon Sep 17 00:00:00 2001 From: Daniel Sa Date: Fri, 23 Aug 2024 11:29:23 -0400 Subject: [PATCH 211/352] drm/amd/display: Emulate Display Hotplug Hang [WHY] Driver reports 0 display when the virtual display is still present, and causes P-state hang in FW. [HOW] When enumerating through streams, check for active planes and use that to indicate number of displays. 
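As an aside (illustration only), a self-contained sketch of the new counting rule; the struct is a stand-in for the dc stream/stream_status pair used by clk_mgr_helper_get_active_display_cnt():

    struct stream_info {
            bool dpms_off;
            bool is_subvp_phantom;
            int plane_count;
    };

    /*
     * A stream counts as an active display when it is not dpms-off or when it
     * still has active planes (covering the headless/virtual case); SubVP
     * phantom streams never count.
     */
    static int count_active_displays(const struct stream_info *streams, int n)
    {
            int i, count = 0;

            for (i = 0; i < n; i++) {
                    if (streams[i].is_subvp_phantom)
                            continue;
                    if (!streams[i].dpms_off || streams[i].plane_count)
                            count++;
            }
            return count;
    }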
Reviewed-by: Dillon Varone Signed-off-by: Daniel Sa Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c index f770828df1493..0e243f4344d05 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c @@ -59,6 +59,7 @@ int clk_mgr_helper_get_active_display_cnt( display_count = 0; for (i = 0; i < context->stream_count; i++) { const struct dc_stream_state *stream = context->streams[i]; + const struct dc_stream_status *stream_status = &context->stream_status[i]; /* Don't count SubVP phantom pipes as part of active * display count @@ -66,13 +67,7 @@ int clk_mgr_helper_get_active_display_cnt( if (dc_state_get_stream_subvp_type(context, stream) == SUBVP_PHANTOM) continue; - /* - * Only notify active stream or virtual stream. - * Need to notify virtual stream to work around - * headless case. HPD does not fire when system is in - * S0i2. - */ - if (!stream->dpms_off || stream->signal == SIGNAL_TYPE_VIRTUAL) + if (!stream->dpms_off || (stream_status && stream_status->plane_count)) display_count++; } From 5a3d3e11349c2e298c0b6b4d37c8241f44d37e3d Mon Sep 17 00:00:00 2001 From: Roman Li Date: Wed, 21 Aug 2024 10:53:15 -0400 Subject: [PATCH 212/352] drm/amd/display: Add dmub hpd sense callback [WHY] HPD sense notification has been implemented in DMUB, which can occur during low power states and need to be notified from firmware to driver. [HOW] Define callback and register new HPD sense notification. Reviewed-by: Nicholas Kazlauskas Signed-off-by: Roman Li Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 20 +++++++++++++++++++ .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 0cff66735cfeb..4ab329e007147 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -807,6 +807,20 @@ static void dmub_hpd_callback(struct amdgpu_device *adev, } } +/** + * dmub_hpd_sense_callback - DMUB HPD sense processing callback. + * @adev: amdgpu_device pointer + * @notify: dmub notification structure + * + * HPD sense changes can occur during low power states and need to be + * notified from firmware to driver. 
+ */ +static void dmub_hpd_sense_callback(struct amdgpu_device *adev, + struct dmub_notification *notify) +{ + DRM_DEBUG_DRIVER("DMUB HPD SENSE callback.\n"); +} + /** * register_dmub_notify_callback - Sets callback for DMUB notify * @adev: amdgpu_device pointer @@ -3808,6 +3822,12 @@ static int register_hpd_handlers(struct amdgpu_device *adev) DRM_ERROR("amdgpu: fail to register dmub hpd callback"); return -EINVAL; } + + if (!register_dmub_notify_callback(adev, DMUB_NOTIFICATION_HPD_SENSE_NOTIFY, + dmub_hpd_sense_callback, true)) { + DRM_ERROR("amdgpu: fail to register dmub hpd sense callback"); + return -EINVAL; + } } list_for_each_entry(connector, diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index 2d7755e2b6c32..15d4690c74d60 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -50,7 +50,7 @@ #define AMDGPU_DM_MAX_NUM_EDP 2 -#define AMDGPU_DMUB_NOTIFICATION_MAX 6 +#define AMDGPU_DMUB_NOTIFICATION_MAX 7 #define HDMI_AMD_VENDOR_SPECIFIC_DATA_BLOCK_IEEE_REGISTRATION_ID 0x00001A #define AMD_VSDB_VERSION_3_FEATURECAP_REPLAYMODE 0x40 From f57b77d667dc6bd2b114d08d04b03869539209f6 Mon Sep 17 00:00:00 2001 From: Yihan Zhu Date: Mon, 26 Aug 2024 14:44:04 -0400 Subject: [PATCH 213/352] drm/amd/display: Enable DML2 override_det_buffer_size_kbytes [WHY] Corrupted screen will be observed when 4k144 DP/HDMI display and 4k144 eDP are connected, changing eDP refresh rate from 60Hz to 144Hz. [HOW] override_det_buffer_size_kbytes should be true for DCN35/DCN351. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Roman Li Reviewed-by: Nicholas Kazlauskas Signed-off-by: Yihan Zhu Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c | 1 + drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c index 46ad684fe1920..893a9d9ee870d 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c @@ -2155,6 +2155,7 @@ static bool dcn35_resource_construct( dc->dml2_options.max_segments_per_hubp = 24; dc->dml2_options.det_segment_size = DCN3_2_DET_SEG_SIZE;/*todo*/ + dc->dml2_options.override_det_buffer_size_kbytes = true; if (dc->config.sdpif_request_limit_words_per_umc == 0) dc->config.sdpif_request_limit_words_per_umc = 16;/*todo*/ diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c index 4c5e722baa3a6..514c6d56925d5 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c @@ -2133,6 +2133,7 @@ static bool dcn351_resource_construct( dc->dml2_options.max_segments_per_hubp = 24; dc->dml2_options.det_segment_size = DCN3_2_DET_SEG_SIZE;/*todo*/ + dc->dml2_options.override_det_buffer_size_kbytes = true; if (dc->config.sdpif_request_limit_words_per_umc == 0) dc->config.sdpif_request_limit_words_per_umc = 16;/*todo*/ From 0765b2afc1118a6ab5fee624e206c782d70db28a Mon Sep 17 00:00:00 2001 From: Dillon Varone Date: Mon, 26 Aug 2024 17:08:33 -0400 Subject: [PATCH 214/352] drm/amd/display: Block timing sync for different 
output formats in pmo [WHY & HOW] If the output format is different for HDMI TMDS signals, they are not synchronizable. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Alvin Lee Signed-off-by: Dillon Varone Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c index d63558ee31351..1cf9015e854a9 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c @@ -940,9 +940,11 @@ static void build_synchronized_timing_groups( /* find synchronizable timing groups */ for (j = i + 1; j < display_config->display_config.num_streams; j++) { if (memcmp(master_timing, - &display_config->display_config.stream_descriptors[j].timing, - sizeof(struct dml2_timing_cfg)) == 0 && - display_config->display_config.stream_descriptors[i].output.output_encoder == display_config->display_config.stream_descriptors[j].output.output_encoder) { + &display_config->display_config.stream_descriptors[j].timing, + sizeof(struct dml2_timing_cfg)) == 0 && + display_config->display_config.stream_descriptors[i].output.output_encoder == display_config->display_config.stream_descriptors[j].output.output_encoder && + (display_config->display_config.stream_descriptors[i].output.output_encoder != dml2_hdmi || //hdmi requires formats match + display_config->display_config.stream_descriptors[i].output.output_format == display_config->display_config.stream_descriptors[j].output.output_format)) { set_bit_in_bitfield(&pmo->scratch.pmo_dcn4.synchronized_timing_group_masks[timing_group_idx], j); set_bit_in_bitfield(&stream_mapped_mask, j); } From 09cb922c4e14e6531979bff4e6bb3babcd9cb188 Mon Sep 17 00:00:00 2001 From: Samson Tam Date: Tue, 27 Aug 2024 11:53:10 -0400 Subject: [PATCH 215/352] drm/amd/display: Add debug options to change sharpen policies [WHY] Add options to change sharpen policy based on surface format and scaling ratios. [HOW] Add sharpen_policy to change policy based on surface format and scale_to_sharpness_policy based on scaling ratios. 
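For reference (illustration only), a condensed sketch of the sharpen_policy gating added below; the enum values mirror the patch while the helper itself is hypothetical:

    enum sharpen_policy_example {
            EX_SHARPEN_ALWAYS = 0,             /* previous behaviour */
            EX_SHARPEN_YUV = 1,                /* YUV surfaces only */
            EX_SHARPEN_RGB_FULLSCREEN_YUV = 2, /* RGB, or YUV when fullscreen */
    };

    static bool sharpen_allowed(enum sharpen_policy_example policy,
                                bool is_yuv, bool is_fullscreen)
    {
            if (policy == EX_SHARPEN_YUV && !is_yuv)
                    return false;
            if (policy == EX_SHARPEN_RGB_FULLSCREEN_YUV && is_yuv && !is_fullscreen)
                    return false;
            return true;
    }

scale_to_sharpness_policy works independently: based on the current scale ratio it steps the requested sharpness level down (up to 5 steps between 1.125x and 1.0x upscale), either for YUV surfaces only or for all formats.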
Reviewed-by: Jun Lei Signed-off-by: Samson Tam Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc.h | 2 + .../gpu/drm/amd/display/dc/dc_spl_translate.c | 5 ++ drivers/gpu/drm/amd/display/dc/spl/dc_spl.c | 34 ++++++-- .../display/dc/spl/dc_spl_isharp_filters.c | 85 +++++++++++++++++-- .../display/dc/spl/dc_spl_isharp_filters.h | 9 +- .../gpu/drm/amd/display/dc/spl/dc_spl_types.h | 12 +++ 6 files changed, 128 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index dcf8a90e961d3..7c4812dd1a718 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1056,6 +1056,8 @@ struct dc_debug_options { unsigned int force_lls; bool notify_dpia_hr_bw; bool enable_ips_visual_confirm; + unsigned int sharpen_policy; + unsigned int scale_to_sharpness_policy; }; diff --git a/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c b/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c index f711fc2e3e651..603552dbd7716 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c +++ b/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c @@ -186,6 +186,11 @@ void translate_SPL_in_params_from_pipe_ctx(struct pipe_ctx *pipe_ctx, struct spl spl_in->h_active = pipe_ctx->plane_res.scl_data.h_active; spl_in->v_active = pipe_ctx->plane_res.scl_data.v_active; + + spl_in->debug.sharpen_policy = (enum sharpen_policy)pipe_ctx->stream->ctx->dc->debug.sharpen_policy; + spl_in->debug.scale_to_sharpness_policy = + (enum scale_to_sharpness_policy)pipe_ctx->stream->ctx->dc->debug.scale_to_sharpness_policy; + /* Check if it is stream is in fullscreen and if its HDR. * Use this to determine sharpness levels */ diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c b/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c index a59aa6b596875..f7a654b3a092f 100644 --- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c +++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c @@ -813,6 +813,14 @@ static bool enable_easf(struct spl_in *spl_in, struct spl_scratch *spl_scratch) return skip_easf; } +/* Check if video is in fullscreen mode */ +static bool spl_is_video_fullscreen(struct spl_in *spl_in) +{ + if (spl_is_yuv420(spl_in->basic_in.format) && spl_in->is_fullscreen) + return true; + return false; +} + static bool spl_get_isharp_en(struct spl_in *spl_in, struct spl_scratch *spl_scratch) { @@ -820,6 +828,7 @@ static bool spl_get_isharp_en(struct spl_in *spl_in, int vratio = 0; int hratio = 0; struct spl_taps taps = spl_scratch->scl_data.taps; + bool fullscreen = spl_is_video_fullscreen(spl_in); /* Return if adaptive sharpness is disabled */ if (spl_in->adaptive_sharpness.enable == false) @@ -835,9 +844,15 @@ static bool spl_get_isharp_en(struct spl_in *spl_in, // Scaling is up to 1:1 (no scaling) or upscaling /* - * Apply sharpness to all RGB surfaces and to - * NV12/P010 surfaces + * Apply sharpness to RGB and YUV (NV12/P010) + * surfaces based on policy setting */ + if (!spl_is_yuv420(spl_in->basic_in.format) && + (spl_in->debug.sharpen_policy == SHARPEN_YUV)) + return enable_isharp; + else if ((spl_is_yuv420(spl_in->basic_in.format) && !fullscreen) && + (spl_in->debug.sharpen_policy == SHARPEN_RGB_FULLSCREEN_YUV)) + return enable_isharp; /* * Apply sharpness if supports horizontal taps 4,6 AND @@ -1562,7 +1577,7 @@ static void spl_set_isharp_data(struct dscl_prog_data *dscl_prog_data, struct adaptive_sharpness adp_sharpness, bool enable_isharp, enum linear_light_scaling lls_pref, enum 
spl_pixel_format format, const struct spl_scaler_data *data, struct spl_fixed31_32 ratio, - enum system_setup setup) + enum system_setup setup, enum scale_to_sharpness_policy scale_to_sharpness_policy) { /* Turn off sharpener if not required */ if (!enable_isharp) { @@ -1570,6 +1585,11 @@ static void spl_set_isharp_data(struct dscl_prog_data *dscl_prog_data, return; } + spl_build_isharp_1dlut_from_reference_curve(ratio, setup, adp_sharpness, + scale_to_sharpness_policy); + dscl_prog_data->isharp_delta = spl_get_pregen_filter_isharp_1D_lut(setup); + dscl_prog_data->sharpness_level = adp_sharpness.sharpness_level; + dscl_prog_data->isharp_en = 1; // ISHARP_EN // Set ISHARP_NOISEDET_MODE if htaps = 6-tap if (data->taps.h_taps == 6) { @@ -1667,11 +1687,6 @@ static void spl_set_isharp_data(struct dscl_prog_data *dscl_prog_data, dscl_prog_data->isharp_lba.base_seg[5] = 0; // ISHARP LBA PWL for Seg 5. BASE value in U0.6 format } - - spl_build_isharp_1dlut_from_reference_curve(ratio, setup, adp_sharpness); - dscl_prog_data->isharp_delta = spl_get_pregen_filter_isharp_1D_lut(setup); - dscl_prog_data->sharpness_level = adp_sharpness.sharpness_level; - // Program the nldelta soft clip values if (lls_pref == LLS_PREF_YES) { dscl_prog_data->isharp_nldelta_sclip.enable_p = 0; /* ISHARP_NLDELTA_SCLIP_EN_P */ @@ -1766,7 +1781,8 @@ bool spl_calculate_scaler_params(struct spl_in *spl_in, struct spl_out *spl_out) isharp_scale_ratio = spl_scratch.scl_data.recip_ratios.horz; spl_set_isharp_data(spl_out->dscl_prog_data, spl_in->adaptive_sharpness, enable_isharp, - spl_in->lls_pref, spl_in->basic_in.format, data, isharp_scale_ratio, setup); + spl_in->lls_pref, spl_in->basic_in.format, data, isharp_scale_ratio, setup, + spl_in->debug.scale_to_sharpness_policy); return res; } diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.c b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.c index 33712f50d303b..e0572252c6404 100644 --- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.c +++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.c @@ -500,6 +500,15 @@ struct isharp_1D_lut_pregen filter_isharp_1D_lut_pregen[NUM_SHARPNESS_SETUPS] = }, }; +struct scale_ratio_to_sharpness_level_adj sharpness_level_adj[NUM_SHARPNESS_ADJ_LEVELS] = { + {1125, 1000, 0}, + {11, 10, 1}, + {1075, 1000, 2}, + {105, 100, 3}, + {1025, 1000, 4}, + {1, 1, 5}, +}; + const uint32_t *spl_get_filter_isharp_1D_lut_0(void) { return filter_isharp_1D_lut_0; @@ -541,19 +550,72 @@ uint16_t *spl_get_filter_isharp_bs_3tap_64p(void) return filter_isharp_bs_3tap_64p_s1_12; } -static unsigned int spl_calculate_sharpness_level(int discrete_sharpness_level, enum system_setup setup, - struct spl_sharpness_range sharpness_range) +static unsigned int spl_calculate_sharpness_level_adj(struct spl_fixed31_32 ratio) +{ + int j; + struct spl_fixed31_32 ratio_level; + struct scale_ratio_to_sharpness_level_adj *lookup_ptr; + unsigned int sharpness_level_down_adj; + + /* + * Adjust sharpness level based on current scaling ratio + * + * We have 5 discrete scaling ratios which we will use to adjust the + * sharpness level down by 1 as we pass each ratio. 
The ratios + * are + * + * 1.125 upscale and higher - no adj + * 1.100 - under 1.125 - adj level down 1 + * 1.075 - under 1.100 - adj level down 2 + * 1.050 - under 1.075 - adj level down 3 + * 1.025 - under 1.050 - adj level down 4 + * 1.000 - under 1.025 - adj level down 5 + * + */ + j = 0; + sharpness_level_down_adj = 0; + lookup_ptr = sharpness_level_adj; + while (j < NUM_SHARPNESS_ADJ_LEVELS) { + ratio_level = spl_fixpt_from_fraction(lookup_ptr->ratio_numer, + lookup_ptr->ratio_denom); + if (ratio.value >= ratio_level.value) { + sharpness_level_down_adj = lookup_ptr->level_down_adj; + break; + } + lookup_ptr++; + j++; + } + return sharpness_level_down_adj; +} + +static unsigned int spl_calculate_sharpness_level(struct spl_fixed31_32 ratio, + int discrete_sharpness_level, enum system_setup setup, + struct spl_sharpness_range sharpness_range, + enum scale_to_sharpness_policy scale_to_sharpness_policy) { unsigned int sharpness_level = 0; + unsigned int sharpness_level_down_adj = 0; int min_sharpness, max_sharpness, mid_sharpness; + /* + * Adjust sharpness level if policy requires we adjust it based on + * scale ratio. Based on scale ratio, we may adjust the sharpness + * level down by a certain number of steps. We will not select + * a sharpness value of 0 so the lowest sharpness level will be + * 0 or 1 depending on what the min_sharpness is + * + * If the policy is no required, this code maybe removed at a later + * date + */ switch (setup) { case HDR_L: min_sharpness = sharpness_range.hdr_rgb_min; max_sharpness = sharpness_range.hdr_rgb_max; mid_sharpness = sharpness_range.hdr_rgb_mid; + if (scale_to_sharpness_policy == SCALE_TO_SHARPNESS_ADJ_ALL) + sharpness_level_down_adj = spl_calculate_sharpness_level_adj(ratio); break; case HDR_NL: /* currently no use case, use Non-linear SDR values for now */ @@ -561,15 +623,26 @@ static unsigned int spl_calculate_sharpness_level(int discrete_sharpness_level, min_sharpness = sharpness_range.sdr_yuv_min; max_sharpness = sharpness_range.sdr_yuv_max; mid_sharpness = sharpness_range.sdr_yuv_mid; + if (scale_to_sharpness_policy >= SCALE_TO_SHARPNESS_ADJ_YUV) + sharpness_level_down_adj = spl_calculate_sharpness_level_adj(ratio); break; case SDR_L: default: min_sharpness = sharpness_range.sdr_rgb_min; max_sharpness = sharpness_range.sdr_rgb_max; mid_sharpness = sharpness_range.sdr_rgb_mid; + if (scale_to_sharpness_policy == SCALE_TO_SHARPNESS_ADJ_ALL) + sharpness_level_down_adj = spl_calculate_sharpness_level_adj(ratio); break; } + if ((min_sharpness == 0) && (sharpness_level_down_adj >= discrete_sharpness_level)) + discrete_sharpness_level = 1; + else if (sharpness_level_down_adj >= discrete_sharpness_level) + discrete_sharpness_level = 0; + else + discrete_sharpness_level -= sharpness_level_down_adj; + int lower_half_step_size = (mid_sharpness - min_sharpness) / 5; int upper_half_step_size = (max_sharpness - mid_sharpness) / 5; @@ -584,7 +657,7 @@ static unsigned int spl_calculate_sharpness_level(int discrete_sharpness_level, } void spl_build_isharp_1dlut_from_reference_curve(struct spl_fixed31_32 ratio, enum system_setup setup, - struct adaptive_sharpness sharpness) + struct adaptive_sharpness sharpness, enum scale_to_sharpness_policy scale_to_sharpness_policy) { uint8_t *byte_ptr_1dlut_src, *byte_ptr_1dlut_dst; struct spl_fixed31_32 sharp_base, sharp_calc, sharp_level; @@ -594,8 +667,9 @@ void spl_build_isharp_1dlut_from_reference_curve(struct spl_fixed31_32 ratio, en uint32_t filter_pregen_store[ISHARP_LUT_TABLE_SIZE]; /* Custom sharpnessX1000 
value */ - unsigned int sharpnessX1000 = spl_calculate_sharpness_level(sharpness.sharpness_level, - setup, sharpness.sharpness_range); + unsigned int sharpnessX1000 = spl_calculate_sharpness_level(ratio, + sharpness.sharpness_level, setup, + sharpness.sharpness_range, scale_to_sharpness_policy); sharp_level = spl_fixpt_from_fraction(sharpnessX1000, 1000); /* @@ -606,7 +680,6 @@ void spl_build_isharp_1dlut_from_reference_curve(struct spl_fixed31_32 ratio, en (filter_isharp_1D_lut_pregen[setup].sharpness_denom == 1000)) return; - /* * Calculate LUT_128_gained with this equation: * diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.h b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.h index fe0b12571f2c5..afcc66206ca2a 100644 --- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.h +++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.h @@ -20,11 +20,11 @@ uint16_t *spl_get_filter_isharp_bs_3tap_64p(void); const uint16_t *spl_get_filter_isharp_wide_6tap_64p(void); uint16_t *spl_dscl_get_blur_scale_coeffs_64p(int taps); -struct scale_ratio_to_sharpness_level_lookup { +#define NUM_SHARPNESS_ADJ_LEVELS 6 +struct scale_ratio_to_sharpness_level_adj { unsigned int ratio_numer; unsigned int ratio_denom; - unsigned int sharpness_numer; - unsigned int sharpness_denom; + unsigned int level_down_adj; /* adjust sharpness level down */ }; struct isharp_1D_lut_pregen { @@ -45,6 +45,7 @@ void spl_init_blur_scale_coeffs(void); void spl_set_blur_scale_data(struct dscl_prog_data *dscl_prog_data, const struct spl_scaler_data *data); -void spl_build_isharp_1dlut_from_reference_curve(struct spl_fixed31_32 ratio, enum system_setup setup, struct adaptive_sharpness sharpness); +void spl_build_isharp_1dlut_from_reference_curve(struct spl_fixed31_32 ratio, enum system_setup setup, + struct adaptive_sharpness sharpness, enum scale_to_sharpness_policy scale_to_sharpness_policy); uint32_t *spl_get_pregen_filter_isharp_1D_lut(enum system_setup setup); #endif /* __DC_SPL_ISHARP_FILTERS_H__ */ diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h index 74f2a8c42f4f8..425d4a282c7a7 100644 --- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h +++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h @@ -487,6 +487,16 @@ enum linear_light_scaling { // convert it in translation logic LLS_PREF_YES, LLS_PREF_NO }; +enum sharpen_policy { + SHARPEN_ALWAYS = 0, + SHARPEN_YUV = 1, + SHARPEN_RGB_FULLSCREEN_YUV = 2 +}; +enum scale_to_sharpness_policy { + NO_SCALE_TO_SHARPNESS_ADJ = 0, + SCALE_TO_SHARPNESS_ADJ_YUV = 1, + SCALE_TO_SHARPNESS_ADJ_ALL = 2 +}; struct spl_funcs { void (*spl_calc_lb_num_partitions) (bool alpha_en, @@ -499,6 +509,8 @@ struct spl_funcs { struct spl_debug { int visual_confirm_base_offset; int visual_confirm_dpp_offset; + enum sharpen_policy sharpen_policy; + enum scale_to_sharpness_policy scale_to_sharpness_policy; }; struct spl_in { From 401c90c4d64f2227fc2f4c02d2ad23296bf5ca6f Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Tue, 27 Aug 2024 14:13:10 -0400 Subject: [PATCH 216/352] drm/amd/display: Block dynamic IPS2 on DCN35 for incompatible FW versions [WHY] Hangs with Z8 can occur if running an older unfixed PMFW version. [HOW] Fallback to RCG only for dynamic IPS2 states if it's not newer than 93.12. Limit to DCN35. 
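For reference (illustration only), the version gate the hunk below applies, as a standalone predicate; the packing of smu_ver into major.minor.patch bytes is inferred from the 93.12 comment in the patch:

    /*
     * PMFW 93.12.0 encodes as 0x005d0c00 in the low three bytes of smu_ver;
     * anything at or below that on DCN 3.5 falls back from dynamic IPS2 to
     * RCG-only behaviour.
     */
    static bool pmfw_needs_ips2_fallback(unsigned int smu_ver)
    {
            return (smu_ver & 0x00FFFFFF) <= 0x005d0c00; /* <= 93.12.0 */
    }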
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Signed-off-by: Nicholas Kazlauskas Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c index 97164b5585a84..b46a3afe48ca7 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c @@ -1222,6 +1222,12 @@ void dcn35_clk_mgr_construct( ctx->dc->debug.disable_dpp_power_gate = false; ctx->dc->debug.disable_hubp_power_gate = false; ctx->dc->debug.disable_dsc_power_gate = false; + + /* Disable dynamic IPS2 in older PMFW (93.12) for Z8 interop. */ + if (ctx->dc->config.disable_ips == DMUB_IPS_ENABLE && + ctx->dce_version == DCN_VERSION_3_5 && + ((clk_mgr->base.smu_ver & 0x00FFFFFF) <= 0x005d0c00)) + ctx->dc->config.disable_ips = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF; } else { /*let's reset the config control flag*/ ctx->dc->config.disable_ips = DMUB_IPS_DISABLE_ALL; /*pmfw not support it, disable it all*/ From 3766a840e093d30e1a2522f650d8a6ac892a8719 Mon Sep 17 00:00:00 2001 From: Martin Tsai Date: Mon, 22 Jul 2024 14:12:25 +0800 Subject: [PATCH 217/352] drm/amd/display: Clean up dsc blocks in accelerated mode [WHY] DSC on eDP could be enabled during VBIOS post. The enabled DSC may not be disabled when enter to OS, once the system was in second screen only mode before entering to S4. In this case, OS will not send setTimings to reset eDP path again. The enabled DSC HW will make a new stream without DSC cannot output normally if it reused this pipe with enabled DSC. [HOW] In accelerated mode, to clean up DSC blocks if eDP is on link but not active when we are not in fast boot and seamless boot. 
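In outline (illustration only, condensed from the hunk below), the per-instance cleanup sequence is: read the DSC state, and for every instance VBIOS left enabled, detach it from the timing generator and stream encoder, disable the DSC block, then power-gate it:

    for (i = 0; i < dc->res_pool->res_cap->num_dsc; i++) {
            struct dcn_dsc_state s = {0};
            struct display_stream_compressor *dsc = dc->res_pool->dscs[i];

            dsc->funcs->dsc_read_state(dsc, &s);
            if (!s.dsc_fw_en)
                    continue;

            tg->funcs->set_dsc_config(tg, OPTC_DSC_DISABLED, 0, 0);    /* OPTC */
            se->funcs->dp_set_dsc_config(se, OPTC_DSC_DISABLED, 0, 0); /* stream encoder */
            dsc->funcs->dsc_disable(dsc);                               /* DSC block */
            pg_cntl->funcs->dsc_pg_control(pg_cntl, dsc->inst, false);  /* power down */
    }

(tg, se and pg_cntl stand for the matching timing generator, stream encoder and power-gating controller looked up in the full function.)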
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Signed-off-by: Martin Tsai Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/dc/hwss/dce110/dce110_hwseq.c | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c index d52ce58c6a987..aa7479b318982 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c @@ -57,6 +57,7 @@ #include "panel_cntl.h" #include "dc_state_priv.h" #include "dpcd_defs.h" +#include "dsc.h" /* include DCE11 register header files */ #include "dce/dce_11_0_d.h" #include "dce/dce_11_0_sh_mask.h" @@ -1823,6 +1824,48 @@ static void get_edp_links_with_sink( } } +static void clean_up_dsc_blocks(struct dc *dc) +{ + struct display_stream_compressor *dsc = NULL; + struct timing_generator *tg = NULL; + struct stream_encoder *se = NULL; + struct dccg *dccg = dc->res_pool->dccg; + struct pg_cntl *pg_cntl = dc->res_pool->pg_cntl; + int i; + + if (dc->ctx->dce_version != DCN_VERSION_3_5 && + dc->ctx->dce_version != DCN_VERSION_3_51) + return; + + for (i = 0; i < dc->res_pool->res_cap->num_dsc; i++) { + struct dcn_dsc_state s = {0}; + + dsc = dc->res_pool->dscs[i]; + dsc->funcs->dsc_read_state(dsc, &s); + if (s.dsc_fw_en) { + /* disable DSC in OPTC */ + if (i < dc->res_pool->timing_generator_count) { + tg = dc->res_pool->timing_generators[i]; + tg->funcs->set_dsc_config(tg, OPTC_DSC_DISABLED, 0, 0); + } + /* disable DSC in stream encoder */ + if (i < dc->res_pool->stream_enc_count) { + se = dc->res_pool->stream_enc[i]; + se->funcs->dp_set_dsc_config(se, OPTC_DSC_DISABLED, 0, 0); + se->funcs->dp_set_dsc_pps_info_packet(se, false, NULL, true); + } + /* disable DSC block */ + if (dccg->funcs->set_ref_dscclk) + dccg->funcs->set_ref_dscclk(dccg, dsc->inst); + dsc->funcs->dsc_disable(dsc); + + /* power down DSC */ + if (pg_cntl != NULL) + pg_cntl->funcs->dsc_pg_control(pg_cntl, dsc->inst, false); + } + } +} + /* * When ASIC goes from VBIOS/VGA mode to driver/accelerated mode we need: * 1. Power down all DC HW blocks @@ -1927,6 +1970,13 @@ void dce110_enable_accelerated_mode(struct dc *dc, struct dc_state *context) clk_mgr_exit_optimized_pwr_state(dc, dc->clk_mgr); power_down_all_hw_blocks(dc); + + /* DSC could be enabled on eDP during VBIOS post. + * To clean up dsc blocks if eDP is in link but not active. + */ + if (edp_link_with_sink && (edp_stream_num == 0)) + clean_up_dsc_blocks(dc); + disable_vga_and_power_gate_all_controllers(dc); if (edp_link_with_sink && !keep_edp_vdd_on) dc->hwss.edp_power_control(edp_link_with_sink, false); From ae5100805f98641ea4112241e350485c97936bbe Mon Sep 17 00:00:00 2001 From: Sung Joon Kim Date: Tue, 27 Aug 2024 14:49:44 -0400 Subject: [PATCH 218/352] drm/amd/display: Disable SYMCLK32_LE root clock gating [WHY & HOW] On display on sequence, enabling SYMCLK32_LE root clock gating causes issue in link training so disabling it is needed. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Signed-off-by: Sung Joon Kim Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c index 514c6d56925d5..da9101b83e8c1 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c @@ -736,7 +736,7 @@ static const struct dc_debug_options debug_defaults_drv = { .hdmichar = true, .dpstream = true, .symclk32_se = true, - .symclk32_le = true, + .symclk32_le = false, .symclk_fe = true, .physymclk = false, .dpiasymclk = true, From cf4cebcec619d963fa7496018f03cb0ff00dc257 Mon Sep 17 00:00:00 2001 From: Peichen Huang Date: Thu, 22 Aug 2024 14:50:07 +0800 Subject: [PATCH 219/352] drm/amd/display: Restructure dpia link training [WHY] We intend to consolidate dp tunneling and conventional dp link training. [HOW] 1. Use the same link training entry for both dp and dpia 2. Move SET_CONFIG of non-transparent mode to dmub side 3. Add set_tps_notification dmub_cmd to notify tps request for non-transparent dpia link training 4. Check dpcd request result and abort link training early if dpia aux tunneling fails 5. Add option to avoid affect old product 6. Separately handle wait_time_microsec for dpia Reviewed-by: Cruise Hung Reviewed-by: George Shen Reviewed-by: Meenakshikumar Somasundaram Signed-off-by: Peichen Huang Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 21 +++++ drivers/gpu/drm/amd/display/dc/dc.h | 6 +- .../amd/display/dc/link/hwss/link_hwss_dpia.c | 31 ++++++- .../dc/link/protocols/link_dp_training.c | 80 ++++++++++++++++--- .../dc/link/protocols/link_dp_training.h | 16 +++- .../link/protocols/link_dp_training_8b_10b.c | 21 ++--- .../dc/link/protocols/link_dp_training_dpia.c | 64 ++++++++------- .../dc/link/protocols/link_dp_training_dpia.h | 19 +++++ drivers/gpu/drm/amd/display/dmub/dmub_srv.h | 1 + .../gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 25 +++++- .../gpu/drm/amd/display/dmub/src/dmub_dcn35.c | 1 + 11 files changed, 233 insertions(+), 52 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 243928b0a39f6..eb7c7681bdd9e 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -5755,6 +5755,27 @@ enum dc_status dc_process_dmub_set_mst_slots(const struct dc *dc, return DC_OK; } +/** + * dc_process_dmub_dpia_set_tps_notification - Submits tps notification + * + * @dc: [in] dc structure + * @link_index: [in] link index + * @ts: [in] request tps + * + * Submits set_tps_notification command to dmub via inbox message + */ +void dc_process_dmub_dpia_set_tps_notification(const struct dc *dc, uint32_t link_index, uint8_t tps) +{ + union dmub_rb_cmd cmd = {0}; + + cmd.set_tps_notification.header.type = DMUB_CMD__DPIA; + cmd.set_tps_notification.header.sub_type = DMUB_CMD__DPIA_SET_TPS_NOTIFICATION; + cmd.set_tps_notification.tps_notification.instance = dc->links[link_index]->ddc_hw_inst; + cmd.set_tps_notification.tps_notification.tps = tps; + + dc_wake_and_execute_dmub_cmd(dc->ctx, &cmd, DM_DMUB_WAIT_TYPE_WAIT); +} + /** * 
dc_process_dmub_dpia_hpd_int_enable - Submits DPIA DPD interruption * diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 7c4812dd1a718..133cac4d9fc4f 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -462,6 +462,7 @@ struct dc_config { bool support_edp0_on_dp1; unsigned int enable_fpo_flicker_detection; bool disable_hbr_audio_dp2; + bool consolidated_dpia_dp_lt; }; enum visual_confirm { @@ -762,7 +763,8 @@ union dpia_debug_options { uint32_t disable_mst_dsc_work_around:1; /* bit 3 */ uint32_t enable_force_tbt3_work_around:1; /* bit 4 */ uint32_t disable_usb4_pm_support:1; /* bit 5 */ - uint32_t reserved:26; + uint32_t enable_consolidated_dpia_dp_lt:1; /* bit 6 */ + uint32_t reserved:25; } bits; uint32_t raw; }; @@ -2525,6 +2527,8 @@ enum dc_status dc_process_dmub_set_mst_slots(const struct dc *dc, uint8_t mst_alloc_slots, uint8_t *mst_slots_in_use); +void dc_process_dmub_dpia_set_tps_notification(const struct dc *dc, uint32_t link_index, uint8_t tps); + void dc_process_dmub_dpia_hpd_int_enable(const struct dc *dc, uint32_t hpd_int_enable); diff --git a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dpia.c b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dpia.c index 46fb3649bc86a..6499807af72a1 100644 --- a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dpia.c +++ b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dpia.c @@ -50,8 +50,31 @@ static void update_dpia_stream_allocation_table(struct dc_link *link, DC_LOG_MST("dpia : status[%d]: alloc_slots[%d]: used_slots[%d]\n", status, mst_alloc_slots, prev_mst_slots_in_use); - ASSERT(link_enc); - link_enc->funcs->update_mst_stream_allocation_table(link_enc, table); + if (link_enc) + link_enc->funcs->update_mst_stream_allocation_table(link_enc, table); +} + +static void set_dio_dpia_link_test_pattern(struct dc_link *link, + const struct link_resource *link_res, + struct encoder_set_dp_phy_pattern_param *tp_params) +{ + if (tp_params->dp_phy_pattern != DP_TEST_PATTERN_VIDEO_MODE) + return; + + struct link_encoder *link_enc = link_enc_cfg_get_link_enc(link); + + if (!link_enc) + return; + + link_enc->funcs->dp_set_phy_pattern(link_enc, tp_params); + link->dc->link_srv->dp_trace_source_sequence(link, DPCD_SOURCE_SEQ_AFTER_SET_SOURCE_PATTERN); +} + +static void set_dio_dpia_lane_settings(struct dc_link *link, + const struct link_resource *link_res, + const struct dc_link_settings *link_settings, + const struct dc_lane_settings lane_settings[LANE_COUNT_DP_MAX]) +{ } static const struct link_hwss dpia_link_hwss = { @@ -65,8 +88,8 @@ static const struct link_hwss dpia_link_hwss = { .ext = { .set_throttled_vcp_size = set_dio_throttled_vcp_size, .enable_dp_link_output = enable_dio_dp_link_output, - .set_dp_link_test_pattern = set_dio_dp_link_test_pattern, - .set_dp_lane_settings = set_dio_dp_lane_settings, + .set_dp_link_test_pattern = set_dio_dpia_link_test_pattern, + .set_dp_lane_settings = set_dio_dpia_lane_settings, .update_stream_allocation_table = update_dpia_stream_allocation_table, }, }; diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.c index 988999c444754..27b881f947e8b 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.c @@ -515,6 +515,41 @@ bool dp_is_interlane_aligned(union lane_align_status_updated align_status) return 
align_status.bits.INTERLANE_ALIGN_DONE == 1; } +bool dp_check_interlane_aligned(union lane_align_status_updated align_status, + struct dc_link *link, + uint8_t retries) +{ + /* Take into consideration corner case for DP 1.4a LL Compliance CTS as USB4 + * has to share encoders unlike DP and USBC + */ + return (dp_is_interlane_aligned(align_status) || + (link->skip_fallback_on_link_loss && retries)); +} + +uint32_t dp_get_eq_aux_rd_interval( + const struct dc_link *link, + const struct link_training_settings *lt_settings, + uint32_t offset, + uint8_t retries) +{ + if (link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) { + if (offset == 0 && retries == 1 && lt_settings->lttpr_mode == LTTPR_MODE_NON_TRANSPARENT) + return max(lt_settings->eq_pattern_time, (uint32_t) DPIA_CLK_SYNC_DELAY); + else + return dpia_get_eq_aux_rd_interval(link, lt_settings, offset); + } else if (is_repeater(lt_settings, offset)) + return dp_translate_training_aux_read_interval( + link->dpcd_caps.lttpr_caps.aux_rd_interval[offset - 1]); + else + return lt_settings->eq_pattern_time; +} + +bool dp_check_dpcd_reqeust_status(const struct dc_link *link, + enum dc_status status) +{ + return (status != DC_OK && link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA); +} + enum link_training_result dp_check_link_loss_status( struct dc_link *link, const struct link_training_settings *link_training_setting) @@ -973,13 +1008,17 @@ void repeater_training_done(struct dc_link *link, uint32_t offset) dpcd_pattern.v1_4.TRAINING_PATTERN_SET); } -static void dpcd_exit_training_mode(struct dc_link *link, enum dp_link_encoding encoding) +static enum link_training_result dpcd_exit_training_mode(struct dc_link *link, enum dp_link_encoding encoding) { + enum dc_status status; uint8_t sink_status = 0; uint8_t i; /* clear training pattern set */ - dpcd_set_training_pattern(link, DP_TRAINING_PATTERN_VIDEOIDLE); + status = dpcd_set_training_pattern(link, DP_TRAINING_PATTERN_VIDEOIDLE); + + if (dp_check_dpcd_reqeust_status(link, status)) + return LINK_TRAINING_ABORT; if (encoding == DP_128b_132b_ENCODING) { /* poll for intra-hop disable */ @@ -990,6 +1029,8 @@ static void dpcd_exit_training_mode(struct dc_link *link, enum dp_link_encoding fsleep(1000); } } + + return LINK_TRAINING_SUCCESS; } enum dc_status dpcd_configure_channel_coding(struct dc_link *link, @@ -1013,17 +1054,18 @@ enum dc_status dpcd_configure_channel_coding(struct dc_link *link, return status; } -void dpcd_set_training_pattern( +enum dc_status dpcd_set_training_pattern( struct dc_link *link, enum dc_dp_training_pattern training_pattern) { + enum dc_status status; union dpcd_training_pattern dpcd_pattern = {0}; dpcd_pattern.v1_4.TRAINING_PATTERN_SET = dp_training_pattern_to_dpcd_training_pattern( link, training_pattern); - core_link_write_dpcd( + status = core_link_write_dpcd( link, DP_TRAINING_PATTERN_SET, &dpcd_pattern.raw, @@ -1033,6 +1075,8 @@ void dpcd_set_training_pattern( __func__, DP_TRAINING_PATTERN_SET, dpcd_pattern.v1_4.TRAINING_PATTERN_SET); + + return status; } enum dc_status dpcd_set_link_settings( @@ -1185,6 +1229,13 @@ void dpcd_set_lt_pattern_and_lane_settings( dpcd_lt_buffer[DP_TRAINING_PATTERN_SET - DP_TRAINING_PATTERN_SET] = dpcd_pattern.raw; + if (link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) + dpia_set_tps_notification( + link, + lt_settings, + dpcd_pattern.v1_4.TRAINING_PATTERN_SET, + offset); + if (is_repeater(lt_settings, offset)) { DC_LOG_HW_LINK_TRAINING("%s\n LTTPR Repeater ID: %d\n 0x%X pattern = %x\n", __func__, @@ -1455,7 +1506,8 @@ static enum link_training_result 
dp_transition_to_video_idle( */ if (link->connector_signal != SIGNAL_TYPE_EDP && status == LINK_TRAINING_SUCCESS) { msleep(5); - status = dp_check_link_loss_status(link, lt_settings); + if (!link->skip_fallback_on_link_loss) + status = dp_check_link_loss_status(link, lt_settings); } return status; } @@ -1521,7 +1573,9 @@ enum link_training_result dp_perform_link_training( ASSERT(0); /* exit training mode */ - dpcd_exit_training_mode(link, encoding); + if ((dpcd_exit_training_mode(link, encoding) != LINK_TRAINING_SUCCESS || status == LINK_TRAINING_ABORT) && + link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) + dpia_training_abort(link, <_settings, 0); /* switch to video idle */ if ((status == LINK_TRAINING_SUCCESS) || !skip_video_pattern) @@ -1599,8 +1653,7 @@ bool perform_link_training_with_retries( dp_perform_link_training_skip_aux(link, &pipe_ctx->link_res, &cur_link_settings); return true; } else { - /** @todo Consolidate USB4 DP and DPx.x training. */ - if (link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) { + if (!link->dc->config.consolidated_dpia_dp_lt && link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) { status = dpia_perform_link_training( link, &pipe_ctx->link_res, @@ -1629,8 +1682,17 @@ bool perform_link_training_with_retries( dp_trace_lt_total_count_increment(link, false); dp_trace_lt_result_update(link, status, false); dp_trace_set_lt_end_timestamp(link, false); - if (status == LINK_TRAINING_SUCCESS && !is_link_bw_low) + if (status == LINK_TRAINING_SUCCESS && !is_link_bw_low) { + // Update verified link settings to current one + // Because DPIA LT might fallback to lower link setting. + if (link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA && + stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST) { + link->verified_link_cap.link_rate = link->cur_link_settings.link_rate; + link->verified_link_cap.lane_count = link->cur_link_settings.lane_count; + dm_helpers_dp_mst_update_branch_bandwidth(link->ctx, link); + } return true; + } } fail_count++; diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.h b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.h index 851bd17317a0c..0b18aa35c33cb 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.h +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.h @@ -55,7 +55,7 @@ void dp_set_hw_test_pattern( uint8_t *custom_pattern, uint32_t custom_pattern_size); -void dpcd_set_training_pattern( +enum dc_status dpcd_set_training_pattern( struct dc_link *link, enum dc_dp_training_pattern training_pattern); @@ -182,4 +182,18 @@ uint32_t dp_translate_training_aux_read_interval( uint8_t dp_get_nibble_at_index(const uint8_t *buf, uint32_t index); + +bool dp_check_interlane_aligned(union lane_align_status_updated align_status, + struct dc_link *link, + uint8_t retries); + +uint32_t dp_get_eq_aux_rd_interval( + const struct dc_link *link, + const struct link_training_settings *lt_settings, + uint32_t offset, + uint8_t retries); + +bool dp_check_dpcd_reqeust_status(const struct dc_link *link, + enum dc_status status); + #endif /* __DC_LINK_DP_TRAINING_H__ */ diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c index 2b4c15b0b4070..3bdce32a85e3c 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c @@ -157,6 +157,7 @@ enum link_training_result perform_8b_10b_clock_recovery_sequence( struct 
link_training_settings *lt_settings, uint32_t offset) { + enum dc_status status; uint32_t retries_cr; uint32_t retry_count; uint32_t wait_time_microsec; @@ -216,7 +217,7 @@ enum link_training_result perform_8b_10b_clock_recovery_sequence( /* 4. Read lane status and requested drive * settings as set by the sink */ - dp_get_lane_status_and_lane_adjust( + status = dp_get_lane_status_and_lane_adjust( link, lt_settings, dpcd_lane_status, @@ -224,6 +225,9 @@ enum link_training_result perform_8b_10b_clock_recovery_sequence( dpcd_lane_adjust, offset); + if (dp_check_dpcd_reqeust_status(link, status)) + return LINK_TRAINING_ABORT; + /* 5. check CR done*/ if (dp_is_cr_done(lane_count, dpcd_lane_status)) { DC_LOG_HW_LINK_TRAINING("%s: Clock recovery OK\n", __func__); @@ -273,6 +277,7 @@ enum link_training_result perform_8b_10b_channel_equalization_sequence( struct link_training_settings *lt_settings, uint32_t offset) { + enum dc_status status; enum dc_dp_training_pattern tr_pattern; uint32_t retries_ch_eq; uint32_t wait_time_microsec; @@ -308,12 +313,7 @@ enum link_training_result perform_8b_10b_channel_equalization_sequence( dpcd_set_lane_settings(link, lt_settings, offset); /* 3. wait for receiver to lock-on*/ - wait_time_microsec = lt_settings->eq_pattern_time; - - if (is_repeater(lt_settings, offset)) - wait_time_microsec = - dp_translate_training_aux_read_interval( - link->dpcd_caps.lttpr_caps.aux_rd_interval[offset - 1]); + wait_time_microsec = dp_get_eq_aux_rd_interval(link, lt_settings, offset, retries_ch_eq); dp_wait_for_training_aux_rd_interval( link, @@ -322,7 +322,7 @@ enum link_training_result perform_8b_10b_channel_equalization_sequence( /* 4. Read lane status and requested * drive settings as set by the sink*/ - dp_get_lane_status_and_lane_adjust( + status = dp_get_lane_status_and_lane_adjust( link, lt_settings, dpcd_lane_status, @@ -330,6 +330,9 @@ enum link_training_result perform_8b_10b_channel_equalization_sequence( dpcd_lane_adjust, offset); + if (dp_check_dpcd_reqeust_status(link, status)) + return LINK_TRAINING_ABORT; + /* 5. check CR done*/ if (!dp_is_cr_done(lane_count, dpcd_lane_status)) return dpcd_lane_status[0].bits.CR_DONE_0 ? @@ -339,7 +342,7 @@ enum link_training_result perform_8b_10b_channel_equalization_sequence( /* 6. check CHEQ done*/ if (dp_is_ch_eq_done(lane_count, dpcd_lane_status) && dp_is_symbol_locked(lane_count, dpcd_lane_status) && - dp_is_interlane_aligned(dpcd_lane_status_updated)) + dp_check_interlane_aligned(dpcd_lane_status_updated, link, retries_ch_eq)) return LINK_TRAINING_SUCCESS; /* 7. update VS/PE/PC2 in lt_settings*/ diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.c index cd1975c03f38d..39e4b7dc9588f 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.c @@ -43,9 +43,6 @@ #define DC_LOGGER \ link->ctx->logger -/* The approximate time (us) it takes to transmit 9 USB4 DP clock sync packets. */ -#define DPIA_CLK_SYNC_DELAY 16000 - /* Extend interval between training status checks for manual testing. */ #define DPIA_DEBUG_EXTENDED_AUX_RD_INTERVAL_US 60000000 @@ -566,28 +563,6 @@ static enum link_training_result dpia_training_cr_phase( return result; } -/* Return status read interval during equalization phase. 
*/ -static uint32_t dpia_get_eq_aux_rd_interval( - const struct dc_link *link, - const struct link_training_settings *lt_settings, - uint32_t hop) -{ - uint32_t wait_time_microsec; - - if (hop == DPRX) - wait_time_microsec = lt_settings->eq_pattern_time; - else - wait_time_microsec = - dp_translate_training_aux_read_interval( - link->dpcd_caps.lttpr_caps.aux_rd_interval[hop - 1]); - - /* Check debug option for extending aux read interval. */ - if (link->dc->debug.dpia_debug.bits.extend_aux_rd_interval) - wait_time_microsec = DPIA_DEBUG_EXTENDED_AUX_RD_INTERVAL_US; - - return wait_time_microsec; -} - /* Execute equalization phase of link training for specified hop in display * path in non-transparent mode: * - driver issues both DPCD and SET_CONFIG transactions. @@ -936,6 +911,22 @@ static enum link_training_result dpia_training_end( return result; } +/* Return status read interval during equalization phase. */ +uint32_t dpia_get_eq_aux_rd_interval( + const struct dc_link *link, + const struct link_training_settings *lt_settings, + uint32_t hop) +{ + /* Check debug option for extending aux read interval. */ + if (link->dc->debug.dpia_debug.bits.extend_aux_rd_interval) + return DPIA_DEBUG_EXTENDED_AUX_RD_INTERVAL_US; + else if (hop == DPRX) + return lt_settings->eq_pattern_time; + else + return dp_translate_training_aux_read_interval( + link->dpcd_caps.lttpr_caps.aux_rd_interval[hop - 1]); +} + /* When aborting training of specified hop in display path, clean up by: * - Attempting to clear DPCD TRAINING_PATTERN_SET, LINK_BW_SET and LANE_COUNT_SET. * - Sending SET_CONFIG(SET_LINK) with lane count and link rate set to 0. @@ -943,7 +934,7 @@ static enum link_training_result dpia_training_end( * @param link DPIA link being trained. * @param hop Hop in display path. DPRX = 0. */ -static void dpia_training_abort( +void dpia_training_abort( struct dc_link *link, struct link_training_settings *lt_settings, uint32_t hop) @@ -968,7 +959,26 @@ static void dpia_training_abort( core_link_write_dpcd(link, dpcd_tps_offset, &data, 1); core_link_write_dpcd(link, DP_LINK_BW_SET, &data, 1); core_link_write_dpcd(link, DP_LANE_COUNT_SET, &data, 1); - core_link_send_set_config(link, DPIA_SET_CFG_SET_LINK, data); + + if (!link->dc->config.consolidated_dpia_dp_lt) + core_link_send_set_config(link, DPIA_SET_CFG_SET_LINK, data); +} + +void dpia_set_tps_notification( + struct dc_link *link, + const struct link_training_settings *lt_settings, + uint8_t pattern, + uint32_t hop) +{ + uint8_t repeater_cnt = 0; /* Number of hops/repeaters in display path. */ + + if (lt_settings->lttpr_mode != LTTPR_MODE_NON_TRANSPARENT || pattern == DPCD_TRAINING_PATTERN_VIDEOIDLE) + return; + + repeater_cnt = dp_parse_lttpr_repeater_count(link->dpcd_caps.lttpr_caps.phy_repeater_cnt); + + if (hop != repeater_cnt) + dc_process_dmub_dpia_set_tps_notification(link->ctx->dc, link->link_index, pattern); } enum link_training_result dpia_perform_link_training( diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.h b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.h index b39fb9faf1c2c..9f4eceb494c2d 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.h +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.h @@ -28,6 +28,9 @@ #define __DC_LINK_DP_TRAINING_DPIA_H__ #include "link_dp_training.h" +/* The approximate time (us) it takes to transmit 9 USB4 DP clock sync packets. 
*/ +#define DPIA_CLK_SYNC_DELAY 16000 + /* Train DP tunneling link for USB4 DPIA display endpoint. * DPIA equivalent of dc_link_dp_perfrorm_link_training. * Aborts link training upon detection of sink unplug. @@ -38,4 +41,20 @@ enum link_training_result dpia_perform_link_training( const struct dc_link_settings *link_setting, bool skip_video_pattern); +void dpia_training_abort( + struct dc_link *link, + struct link_training_settings *lt_settings, + uint32_t hop); + +uint32_t dpia_get_eq_aux_rd_interval( + const struct dc_link *link, + const struct link_training_settings *lt_settings, + uint32_t hop); + +void dpia_set_tps_notification( + struct dc_link *link, + const struct link_training_settings *lt_settings, + uint8_t pattern, + uint32_t offset); + #endif /* __DC_LINK_DP_TRAINING_DPIA_H__ */ diff --git a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h index cd70453aeae05..fe5b6f7a3eb1e 100644 --- a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h +++ b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h @@ -300,6 +300,7 @@ struct dmub_srv_hw_params { enum dmub_ips_disable_type disable_ips; bool disallow_phy_access; bool disable_sldo_opt; + bool enable_non_transparent_setconfig; }; /** diff --git a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h index e20c220aa8b4c..ebcf68bfae2b3 100644 --- a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h +++ b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h @@ -682,7 +682,7 @@ union dmub_fw_boot_options { uint32_t gpint_scratch8: 1; /* 1 if GPINT is in scratch8*/ uint32_t usb4_cm_version: 1; /**< 1 CM support */ uint32_t dpia_hpd_int_enable_supported: 1; /* 1 if dpia hpd int enable supported */ - uint32_t reserved0: 1; + uint32_t enable_non_transparent_setconfig: 1; /* 1 if dpia use conventional dp lt flow*/ uint32_t disable_clk_ds: 1; /* 1 if disallow dispclk_ds and dppclk_ds*/ uint32_t disable_timeout_recovery : 1; /* 1 if timeout recovery should be disabled */ uint32_t ips_pg_disable: 1; /* 1 to disable ONO domains power gating*/ @@ -1308,6 +1308,7 @@ enum dmub_cmd_dpia_type { DMUB_CMD__DPIA_DIG1_DPIA_CONTROL = 0, DMUB_CMD__DPIA_SET_CONFIG_ACCESS = 1, DMUB_CMD__DPIA_MST_ALLOC_SLOTS = 2, + DMUB_CMD__DPIA_SET_TPS_NOTIFICATION = 3, }; /* DMUB_OUT_CMD__DPIA_NOTIFICATION command types. */ @@ -2138,6 +2139,24 @@ struct dmub_rb_cmd_set_mst_alloc_slots { struct dmub_cmd_mst_alloc_slots_control_data mst_slots_control; /* mst slots control */ }; +/** + * Data passed from driver to FW in a DMUB_CMD__SET_TPS_NOTIFICATION command. + */ +struct dmub_cmd_tps_notification_data { + uint8_t instance; /* DPIA instance */ + uint8_t tps; /* requested training pattern */ + uint8_t reserved1; + uint8_t reserved2; +}; + +/** + * DMUB command structure for SET_TPS_NOTIFICATION command. + */ +struct dmub_rb_cmd_set_tps_notification { + struct dmub_cmd_header header; /* header */ + struct dmub_cmd_tps_notification_data tps_notification; /* set tps_notification data */ +}; + /** * DMUB command structure for DPIA HPD int enable control. */ @@ -5304,6 +5323,10 @@ union dmub_rb_cmd { * Definition of a DMUB_CMD__DPIA_MST_ALLOC_SLOTS command. */ struct dmub_rb_cmd_set_mst_alloc_slots set_mst_alloc_slots; + /** + * Definition of a DMUB_CMD__DPIA_SET_TPS_NOTIFICATION command. + */ + struct dmub_rb_cmd_set_tps_notification set_tps_notification; /** * Definition of a DMUB_CMD__EDID_CEA command. 
*/ diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn35.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn35.c index 746696b6f09a8..2ccad79053c58 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn35.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn35.c @@ -425,6 +425,7 @@ void dmub_dcn35_enable_dmub_boot_options(struct dmub_srv *dmub, const struct dmu boot_options.bits.ips_disable = params->disable_ips; boot_options.bits.ips_sequential_ono = params->ips_sequential_ono; boot_options.bits.disable_sldo_opt = params->disable_sldo_opt; + boot_options.bits.enable_non_transparent_setconfig = params->enable_non_transparent_setconfig; REG_WRITE(DMCUB_SCRATCH14, boot_options.all); } From e79563bf5fb17d1a199c7c0f7d5a7a98c077302a Mon Sep 17 00:00:00 2001 From: Relja Vojvodic Date: Wed, 28 Aug 2024 11:42:26 -0400 Subject: [PATCH 220/352] drm/amd/display: Add fullscreen only sharpening policy [WHAT & HOW] Disable sharpening if not in fullscreen if this policy is selected Reviewed-by: Samson Tam Signed-off-by: Relja Vojvodic Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/spl/dc_spl.c | 3 +++ drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c b/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c index f7a654b3a092f..014e8a296f0c7 100644 --- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c +++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c @@ -853,6 +853,9 @@ static bool spl_get_isharp_en(struct spl_in *spl_in, else if ((spl_is_yuv420(spl_in->basic_in.format) && !fullscreen) && (spl_in->debug.sharpen_policy == SHARPEN_RGB_FULLSCREEN_YUV)) return enable_isharp; + else if (!spl_in->is_fullscreen && + spl_in->debug.sharpen_policy == SHARPEN_FULLSCREEN_ALL) + return enable_isharp; /* * Apply sharpness if supports horizontal taps 4,6 AND diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h index 425d4a282c7a7..2a74ff5fdfdbc 100644 --- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h +++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h @@ -490,7 +490,8 @@ enum linear_light_scaling { // convert it in translation logic enum sharpen_policy { SHARPEN_ALWAYS = 0, SHARPEN_YUV = 1, - SHARPEN_RGB_FULLSCREEN_YUV = 2 + SHARPEN_RGB_FULLSCREEN_YUV = 2, + SHARPEN_FULLSCREEN_ALL = 3 }; enum scale_to_sharpness_policy { NO_SCALE_TO_SHARPNESS_ADJ = 0, From 07bfa9cdbf3cd2daadfaaba0601f126f45951ffa Mon Sep 17 00:00:00 2001 From: Leo Ma Date: Mon, 19 Aug 2024 13:25:27 -0400 Subject: [PATCH 221/352] drm/amd/display: Add HDMI DSC native YCbCr422 support [WHY && HOW] For some HDMI OVT timing, YCbCr422 encoding fails at the DSC bandwidth check. The root cause is our DSC policy for timing doesn't account for HDMI YCbCr422 native support. 
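Any other caller of dc_dsc_get_policy_for_timing() now needs to pass the
link encoding as the new fourth argument. A minimal usage sketch, mirroring
the in-tree call sites updated below (stream/link are whatever the caller
already has in scope):

    struct dc_dsc_policy dsc_policy = {0};

    dc_dsc_get_policy_for_timing(&stream->timing, 0, &dsc_policy,
            dc_link_get_highest_encoding_format(stream->link));
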
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Chris Park Signed-off-by: Leo Ma Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 4 ++-- drivers/gpu/drm/amd/display/dc/dc_dsc.h | 3 ++- drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index c0c61c03984c5..83a31b97e96bf 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -1147,7 +1147,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, params[count].num_slices_v = aconnector->dsc_settings.dsc_num_slices_v; params[count].bpp_overwrite = aconnector->dsc_settings.dsc_bits_per_pixel; params[count].compression_possible = stream->sink->dsc_caps.dsc_dec_caps.is_dsc_supported; - dc_dsc_get_policy_for_timing(params[count].timing, 0, &dsc_policy); + dc_dsc_get_policy_for_timing(params[count].timing, 0, &dsc_policy, dc_link_get_highest_encoding_format(stream->link)); if (!dc_dsc_compute_bandwidth_range( stream->sink->ctx->dc->res_pool->dscs[0], stream->sink->ctx->dc->debug.dsc_min_slice_height_override, @@ -1681,7 +1681,7 @@ static bool is_dsc_common_config_possible(struct dc_stream_state *stream, { struct dc_dsc_policy dsc_policy = {0}; - dc_dsc_get_policy_for_timing(&stream->timing, 0, &dsc_policy); + dc_dsc_get_policy_for_timing(&stream->timing, 0, &dsc_policy, dc_link_get_highest_encoding_format(stream->link)); dc_dsc_compute_bandwidth_range(stream->sink->ctx->dc->res_pool->dscs[0], stream->sink->ctx->dc->debug.dsc_min_slice_height_override, dsc_policy.min_target_bpp * 16, diff --git a/drivers/gpu/drm/amd/display/dc/dc_dsc.h b/drivers/gpu/drm/amd/display/dc/dc_dsc.h index 2a5120ecf48be..9014c24098178 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dsc.h +++ b/drivers/gpu/drm/amd/display/dc/dc_dsc.h @@ -101,7 +101,8 @@ uint32_t dc_dsc_stream_bandwidth_overhead_in_kbps( */ void dc_dsc_get_policy_for_timing(const struct dc_crtc_timing *timing, uint32_t max_target_bpp_limit_override_x16, - struct dc_dsc_policy *policy); + struct dc_dsc_policy *policy, + const enum dc_link_encoding_format link_encoding); void dc_dsc_policy_set_max_target_bpp_limit(uint32_t limit); diff --git a/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c b/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c index 79c426425911f..ebd5df1a36e8b 100644 --- a/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c +++ b/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c @@ -883,7 +883,7 @@ static bool setup_dsc_config( memset(dsc_cfg, 0, sizeof(struct dc_dsc_config)); - dc_dsc_get_policy_for_timing(timing, options->max_target_bpp_limit_override_x16, &policy); + dc_dsc_get_policy_for_timing(timing, options->max_target_bpp_limit_override_x16, &policy, link_encoding); pic_width = timing->h_addressable + timing->h_border_left + timing->h_border_right; pic_height = timing->v_addressable + timing->v_border_top + timing->v_border_bottom; @@ -1173,7 +1173,8 @@ uint32_t dc_dsc_stream_bandwidth_overhead_in_kbps( void dc_dsc_get_policy_for_timing(const struct dc_crtc_timing *timing, uint32_t max_target_bpp_limit_override_x16, - struct dc_dsc_policy *policy) + struct dc_dsc_policy *policy, + const enum dc_link_encoding_format link_encoding) { uint32_t bpc = 0; From ce83ae29f93772d604b4ea73459fb17822d6a6b0 
Mon Sep 17 00:00:00 2001 From: Aric Cyr Date: Tue, 3 Sep 2024 08:45:48 -0400 Subject: [PATCH 222/352] drm/amd/display: 3.2.300 - Add HDMI DSC native YCbCr422 support - Add fullscreen only sharpening policy - Restructure dpia link training - Disable SYMCLK32_LE root clock gating - Clean up dsc blocks in accelerated mode - Block dynamic IPS2 on DCN35 for incompatible FW versions - Add debug options to change sharpen policies - Block timing sync for different output formats in pmo - Enable DML2 override_det_buffer_size_kbytes - Add dmub hpd sense callback - Emulate Display Hotplug Hang - Implement new DPCD register handling - Use SDR white level to calculate matrix coefficients - Round calculated vtotal Reviewed-by: Alex Hung Signed-off-by: Aric Cyr Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 133cac4d9fc4f..e659f4fed19fc 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -55,7 +55,7 @@ struct aux_payload; struct set_config_cmd_payload; struct dmub_notification; -#define DC_VER "3.2.299" +#define DC_VER "3.2.300" #define MAX_SURFACES 3 #define MAX_PLANES 6 From ff599ef6970ee000fa5bc38d02fa5ff5f3fc7575 Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Thu, 29 Aug 2024 17:30:26 -0600 Subject: [PATCH 223/352] drm/amd/display: Check null pointer before dereferencing se [WHAT & HOW] se is null checked previously in the same function, indicating it might be null; therefore, it must be checked when used again. This fixes 1 FORWARD_NULL issue reported by Coverity. Acked-by: Alex Hung Reviewed-by: Rodrigo Siqueira Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index eb7c7681bdd9e..67812fbbb006a 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -1767,7 +1767,7 @@ bool dc_validate_boot_timing(const struct dc *dc, if (crtc_timing->pix_clk_100hz != pix_clk_100hz) return false; - if (!se->funcs->dp_get_pixel_format) + if (!se || !se->funcs->dp_get_pixel_format) return false; if (!se->funcs->dp_get_pixel_format( From b6499840cafca25175f43ebd601913bf31d06f16 Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Thu, 29 Aug 2024 16:35:51 -0600 Subject: [PATCH 224/352] drm/amd/display: Remove always-false branches [WHAT & HOW] req128_c is always set to false and its branch is never taken. Similarly, MacroTileSizeBytes is set to either 256 or 65535 and it is never 4096 and it's branch is not taken. Therefore, their branches are removed. This fixes 3 DEADCODE issues reported by Coverity. 
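For instance, the removed hunk in handle_det_buf_split() looked like the
sketch below; since req128_c is only ever assigned false, the condition can
never hold and the decrement is unreachable:

    if (req128_c && log2_swath_height_c > 0)    /* req128_c is always false */
            log2_swath_height_c -= 1;           /* never executed, removed */
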
Acked-by: Alex Hung Reviewed-by: Alvin Lee Reviewed-by: Rodrigo Siqueira Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c | 3 --- .../amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c | 3 --- .../drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c | 9 --------- 3 files changed, 15 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c index e7019c95ba79e..4fce64a030b60 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c @@ -313,9 +313,6 @@ static void handle_det_buf_split(struct display_mode_lib *mode_lib, if (swath_height_c > 0) log2_swath_height_c = dml_log2(swath_height_c); - - if (req128_c && log2_swath_height_c > 0) - log2_swath_height_c -= 1; } rq_param->dlg.rq_l.swath_height = 1 << log2_swath_height_l; diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c index ae52510417280..3fa9a5da02f6a 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c @@ -313,9 +313,6 @@ static void handle_det_buf_split(struct display_mode_lib *mode_lib, if (swath_height_c > 0) log2_swath_height_c = dml_log2(swath_height_c); - - if (req128_c && log2_swath_height_c > 0) - log2_swath_height_c -= 1; } rq_param->dlg.rq_l.swath_height = 1 << log2_swath_height_l; diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c index 0b132ce1d2cdc..2b275e6803797 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c @@ -1924,15 +1924,6 @@ static unsigned int CalculateVMAndRowBytes( *PixelPTEReqWidth = 32768.0 / BytePerPixel; *PTERequestSize = 64; FractionOfPTEReturnDrop = 0; - } else if (MacroTileSizeBytes == 4096) { - PixelPTEReqHeightPTEs = 1; - *PixelPTEReqHeight = MacroTileHeight; - *PixelPTEReqWidth = 8 * *MacroTileWidth; - *PTERequestSize = 64; - if (ScanDirection != dm_vert) - FractionOfPTEReturnDrop = 0; - else - FractionOfPTEReturnDrop = 7.0 / 8; } else if (GPUVMMinPageSize == 4 && MacroTileSizeBytes > 4096) { PixelPTEReqHeightPTEs = 16; *PixelPTEReqHeight = 16 * BlockHeight256Bytes; From f510dd5c210bf8cc22e4be48cbbda3cb754219f5 Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai Date: Tue, 3 Sep 2024 10:10:44 -0400 Subject: [PATCH 225/352] drm/amd/display: Fix underflow when setting underscan on DCN401 [WHY & HOW] When underscan is set through xrandr, it causes the stream destination rect to change in a way it becomes complicated to handle the calculations for subvp. Since this is a corner case, disable subvp when underscan is set. 
Fix the existing check that is supposed to catch this corner case by adding a check based on the parameters in the stream Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Dillon Varone Reviewed-by: Rodrigo Siqueira Signed-off-by: Aurabindo Pillai Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c index b0d9aed0f2657..8697eac1e1f7e 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c @@ -858,7 +858,9 @@ static void populate_dml21_plane_config_from_plane_state(struct dml2_context *dm plane->immediate_flip = plane_state->flip_immediate; - plane->composition.rect_out_height_spans_vactive = plane_state->dst_rect.height >= stream->timing.v_addressable; + plane->composition.rect_out_height_spans_vactive = + plane_state->dst_rect.height >= stream->timing.v_addressable && + stream->dst.height >= stream->timing.v_addressable; } //TODO : Could be possibly moved to a common helper layer. From 4bdc5b504af7de1f649004cfdd37445d36db6703 Mon Sep 17 00:00:00 2001 From: Zhikai Zhai Date: Tue, 27 Aug 2024 14:06:01 +0800 Subject: [PATCH 226/352] drm/amd/display: Skip to enable dsc if it has been off [WHY] It makes DSC enable when we commit the stream which need keep power off, and then it will skip to disable DSC if pipe reset at this situation as power has been off. It may cause the DSC unexpected enable on the pipe with the next new stream which doesn't support DSC. [HOW] Check the DSC used on current pipe status when update stream. Skip to enable if it has been off. The operation enable DSC should happen when set power on. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Wenjing Liu Signed-off-by: Zhikai Zhai Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c | 14 ++++++++++++++ .../drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c | 13 +++++++++++++ 2 files changed, 27 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c index a36e11606f90e..2e8c9f7382596 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c @@ -1032,6 +1032,20 @@ void dcn32_update_dsc_on_stream(struct pipe_ctx *pipe_ctx, bool enable) struct dsc_config dsc_cfg; struct dsc_optc_config dsc_optc_cfg = {0}; enum optc_dsc_mode optc_dsc_mode; + struct dcn_dsc_state dsc_state = {0}; + + if (!dsc) { + DC_LOG_DSC("DSC is NULL for tg instance %d:", pipe_ctx->stream_res.tg->inst); + return; + } + + if (dsc->funcs->dsc_read_state) { + dsc->funcs->dsc_read_state(dsc, &dsc_state); + if (!dsc_state.dsc_fw_en) { + DC_LOG_DSC("DSC has been disabled for tg instance %d:", pipe_ctx->stream_res.tg->inst); + return; + } + } /* Enable DSC hw block */ dsc_cfg.pic_width = (stream->timing.h_addressable + stream->timing.h_border_left + stream->timing.h_border_right) / opp_cnt; diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index 479fd3e89e5ab..bd309dbdf7b2a 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -334,7 +334,20 @@ static void update_dsc_on_stream(struct pipe_ctx *pipe_ctx, bool enable) struct dsc_config dsc_cfg; struct dsc_optc_config dsc_optc_cfg = {0}; enum optc_dsc_mode optc_dsc_mode; + struct dcn_dsc_state dsc_state = {0}; + if (!dsc) { + DC_LOG_DSC("DSC is NULL for tg instance %d:", pipe_ctx->stream_res.tg->inst); + return; + } + + if (dsc->funcs->dsc_read_state) { + dsc->funcs->dsc_read_state(dsc, &dsc_state); + if (!dsc_state.dsc_fw_en) { + DC_LOG_DSC("DSC has been disabled for tg instance %d:", pipe_ctx->stream_res.tg->inst); + return; + } + } /* Enable DSC hw block */ dsc_cfg.pic_width = (stream->timing.h_addressable + stream->timing.h_border_left + stream->timing.h_border_right) / opp_cnt; dsc_cfg.pic_height = stream->timing.v_addressable + stream->timing.v_border_top + stream->timing.v_border_bottom; From b74571a83fd3e50f804f090aae60c864d458187c Mon Sep 17 00:00:00 2001 From: Charlene Liu Date: Wed, 4 Sep 2024 15:58:25 -0400 Subject: [PATCH 227/352] drm/amd/display: Use full update for swizzle mode change [WHY & HOW] 1) We did linear/non linear transition properly long ago 2) We used that path to handle SystemDisplayEnable 3) We fixed a SystemDisplayEnable inability to fallback to passive by impacting the transition flow generically 4) AFMF later relied on the generic transition behavior Separating the two flows to make (3) non-generic is the best immediate coarse of action. DC can discern SSAMPO3 very easily from SDE. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Chris Park Signed-off-by: Charlene Liu Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 6 +++--- drivers/gpu/drm/amd/display/dc/dc.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 67812fbbb006a..a1652130e4be4 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -2376,7 +2376,7 @@ static bool is_surface_in_context( return false; } -static enum surface_update_type get_plane_info_update_type(const struct dc_surface_update *u) +static enum surface_update_type get_plane_info_update_type(const struct dc *dc, const struct dc_surface_update *u) { union surface_update_flags *update_flags = &u->surface->update_flags; enum surface_update_type update_type = UPDATE_TYPE_FAST; @@ -2455,7 +2455,7 @@ static enum surface_update_type get_plane_info_update_type(const struct dc_surfa /* todo: below are HW dependent, we should add a hook to * DCE/N resource and validated there. */ - if (u->plane_info->tiling_info.gfx9.swizzle != DC_SW_LINEAR) { + if (!dc->debug.skip_full_updated_if_possible) { /* swizzled mode requires RQ to be setup properly, * thus need to run DML to calculate RQ settings */ @@ -2547,7 +2547,7 @@ static enum surface_update_type det_surface_update(const struct dc *dc, update_flags->raw = 0; // Reset all flags - type = get_plane_info_update_type(u); + type = get_plane_info_update_type(dc, u); elevate_update_type(&overall_type, type); type = get_scaling_info_update_type(dc, u); diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index e659f4fed19fc..78ebe636389ec 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1060,6 +1060,7 @@ struct dc_debug_options { bool enable_ips_visual_confirm; unsigned int sharpen_policy; unsigned int scale_to_sharpness_policy; + bool skip_full_updated_if_possible; }; From 327e62f47eb57ae5ff63de82b0815557104e439a Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 13 Sep 2024 13:00:39 -0500 Subject: [PATCH 228/352] drm/amd/display: Validate backlight caps are sane Currently amdgpu takes backlight caps provided by the ACPI tables on systems as is. If the firmware sets maximums that are too low this means that users don't get a good experience. To avoid having to maintain a quirk list of such systems, do a sanity check on the values. Check that the spread is at least half of the values that amdgpu would use if no ACPI table was found and if not use the amdgpu defaults. 
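Concretely, with the driver defaults of 12 and 255 the minimum acceptable
spread is (255 - 12) / 2 = 121. For example (made-up firmware values):
min = 32, max = 64 gives a spread of 32 < 121 and is rejected in favour of
the defaults, while min = 12, max = 200 gives a spread of 188 with sane
bounds and is accepted.
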
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3020 Reviewed-by: Harry Wentland Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 4ab329e007147..08bcb47d70743 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -4469,6 +4469,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) #define AMDGPU_DM_DEFAULT_MIN_BACKLIGHT 12 #define AMDGPU_DM_DEFAULT_MAX_BACKLIGHT 255 +#define AMDGPU_DM_MIN_SPREAD ((AMDGPU_DM_DEFAULT_MAX_BACKLIGHT - AMDGPU_DM_DEFAULT_MIN_BACKLIGHT) / 2) #define AUX_BL_DEFAULT_TRANSITION_TIME_MS 50 static void amdgpu_dm_update_backlight_caps(struct amdgpu_display_manager *dm, @@ -4483,6 +4484,21 @@ static void amdgpu_dm_update_backlight_caps(struct amdgpu_display_manager *dm, return; amdgpu_acpi_get_backlight_caps(&caps); + + /* validate the firmware value is sane */ + if (caps.caps_valid) { + int spread = caps.max_input_signal - caps.min_input_signal; + + if (caps.max_input_signal > AMDGPU_DM_DEFAULT_MAX_BACKLIGHT || + caps.min_input_signal < AMDGPU_DM_DEFAULT_MIN_BACKLIGHT || + spread > AMDGPU_DM_DEFAULT_MAX_BACKLIGHT || + spread < AMDGPU_DM_MIN_SPREAD) { + DRM_DEBUG_KMS("DM: Invalid backlight caps: min=%d, max=%d\n", + caps.min_input_signal, caps.max_input_signal); + caps.caps_valid = false; + } + } + if (caps.caps_valid) { dm->backlight_caps[bl_idx].caps_valid = true; if (caps.aux_support) From 199888aa25b3a3315360224bda9134a9b58c9306 Mon Sep 17 00:00:00 2001 From: Roman Li Date: Thu, 5 Sep 2024 14:22:30 -0400 Subject: [PATCH 229/352] drm/amd/display: Update IPS default mode for DCN35/DCN351 [WHY] RCG state of IPX in idle is more stable for DCN351 and some variants of DCN35 than IPS2. [HOW] Rework dm_get_default_ips_mode() to specify default per ASIC and update DCN35/DCN351 defaults accordingly. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Sun peng Li Signed-off-by: Roman Li Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 50 ++++++++++++------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 08bcb47d70743..94c8d9966ec38 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1771,25 +1771,41 @@ static struct dml2_soc_bb *dm_dmub_get_vbios_bounding_box(struct amdgpu_device * static enum dmub_ips_disable_type dm_get_default_ips_mode( struct amdgpu_device *adev) { - /* - * On DCN35 systems with Z8 enabled, it's possible for IPS2 + Z8 to - * cause a hard hang. A fix exists for newer PMFW. - * - * As a workaround, for non-fixed PMFW, force IPS1+RCG as the deepest - * IPS state in all cases, except for s0ix and all displays off (DPMS), - * where IPS2 is allowed. - * - * When checking pmfw version, use the major and minor only. 
- */ - if (amdgpu_ip_version(adev, DCE_HWIP, 0) == IP_VERSION(3, 5, 0) && - (adev->pm.fw_version & 0x00FFFF00) < 0x005D6300) - return DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF; + enum dmub_ips_disable_type ret = DMUB_IPS_ENABLE; - if (amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 5, 0)) - return DMUB_IPS_ENABLE; + switch (amdgpu_ip_version(adev, DCE_HWIP, 0)) { + case IP_VERSION(3, 5, 0): + /* + * On DCN35 systems with Z8 enabled, it's possible for IPS2 + Z8 to + * cause a hard hang. A fix exists for newer PMFW. + * + * As a workaround, for non-fixed PMFW, force IPS1+RCG as the deepest + * IPS state in all cases, except for s0ix and all displays off (DPMS), + * where IPS2 is allowed. + * + * When checking pmfw version, use the major and minor only. + */ + if ((adev->pm.fw_version & 0x00FFFF00) < 0x005D6300) + ret = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF; + else if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(11, 5, 0)) + /* + * Other ASICs with DCN35 that have residency issues with + * IPS2 in idle. + * We want them to use IPS2 only in display off cases. + */ + ret = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF; + break; + case IP_VERSION(3, 5, 1): + ret = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF; + break; + default: + /* ASICs older than DCN35 do not have IPSs */ + if (amdgpu_ip_version(adev, DCE_HWIP, 0) < IP_VERSION(3, 5, 0)) + ret = DMUB_IPS_DISABLE_ALL; + break; + } - /* ASICs older than DCN35 do not have IPSs */ - return DMUB_IPS_DISABLE_ALL; + return ret; } static int amdgpu_dm_init(struct amdgpu_device *adev) From fa8a4d3659d0c1ad73d5f59b2e0a6d408de5b317 Mon Sep 17 00:00:00 2001 From: Charlene Liu Date: Thu, 5 Sep 2024 17:28:12 -0400 Subject: [PATCH 230/352] drm/amd/display: Clear cached watermark after resume [WHY] Driver could skip program watermarks when resume from S0i3/S4. [HOW] Clear the cached one first to make sure new value gets applied. 
Reviewed-by: Alvin Lee Reviewed-by: Roman Li Signed-off-by: Charlene Liu Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c b/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c index 6293173ba2b9d..5eb3da8d5206e 100644 --- a/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c @@ -545,6 +545,7 @@ static void hubbub35_init(struct hubbub *hubbub) DCHUBBUB_ARB_MAX_REQ_OUTSTAND, 256, DCHUBBUB_ARB_MIN_REQ_OUTSTAND, 256); + memset(&hubbub2->watermarks.a.cstate_pstate, 0, sizeof(hubbub2->watermarks.a.cstate_pstate)); } /*static void hubbub35_set_request_limit(struct hubbub *hubbub, From 06c9aeb57fe894e6e442cd66870cd3e863bbf08c Mon Sep 17 00:00:00 2001 From: Aric Cyr Date: Sun, 8 Sep 2024 21:40:21 -0400 Subject: [PATCH 231/352] drm/amd/display: 3.2.301 - Clear cached watermark after resume - Update IPS default mode for DCN35/DCN351 - Use full update for swizzle mode change - Skip to enable dsc if it has been off - Fix underflow when setting underscan on DCN401 - Remove always-false branches - Check null pointer before dereferencing se Acked-by: Alex Hung Signed-off-by: Aric Cyr Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 78ebe636389ec..3992ad73165bc 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -55,7 +55,7 @@ struct aux_payload; struct set_config_cmd_payload; struct dmub_notification; -#define DC_VER "3.2.300" +#define DC_VER "3.2.301" #define MAX_SURFACES 3 #define MAX_PLANES 6 From ef126c06a98bde1a41303970eb0fc0ac33c3cc02 Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Mon, 22 Jul 2024 19:45:11 +0800 Subject: [PATCH 232/352] drm/amdgpu: Fix get each xcp macro Fix get each xcp macro to loop over each partition correctly Fixes: 4bdca2057933 ("drm/amdgpu: Add utility functions for xcp") Signed-off-by: Asad Kamal Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h index 90138bc5f03d1..32775260556f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h @@ -180,6 +180,6 @@ amdgpu_get_next_xcp(struct amdgpu_xcp_mgr *xcp_mgr, int *from) #define for_each_xcp(xcp_mgr, xcp, i) \ for (i = 0, xcp = amdgpu_get_next_xcp(xcp_mgr, &i); xcp; \ - xcp = amdgpu_get_next_xcp(xcp_mgr, &i)) + ++i, xcp = amdgpu_get_next_xcp(xcp_mgr, &i)) #endif From 42ac749d5b8bf78b347ac8a52eb15cc397b157a0 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Wed, 11 Sep 2024 13:49:51 +0530 Subject: [PATCH 233/352] drm/amdgpu: Fix XCP instance mask calculation Fix instance mask calculation for VCN IP. There are cases where VCN instance could be shared across partitions. Fix here so that other blocks don't need to check for any shared instances based on partition mode. 
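As a worked example (hypothetical configuration, purely for illustration):
with 8 XCCs, 4 VCN instances and CPX mode (one XCC per partition), num_xcp
is 8, num_vcn_xcp = DIV_ROUND_UP(4, 8) = 1 and num_shared_vcn = 8 / 4 = 2,
so XCP_INST_MASK(num_vcn_xcp, xcp_id / num_shared_vcn) maps partitions 0-1
to VCN instance 0, partitions 2-3 to instance 1, and so on.
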
Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 32 +++++++++------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c index 26e2188101e7e..5e8833e4fed2a 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c @@ -94,8 +94,6 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev, case AMDGPU_RING_TYPE_VCN_ENC: case AMDGPU_RING_TYPE_VCN_JPEG: ip_blk = AMDGPU_XCP_VCN; - if (aqua_vanjaram_xcp_vcn_shared(adev)) - inst_mask = 1 << (inst_idx * 2); break; default: DRM_ERROR("Not support ring type %d!", ring->funcs->type); @@ -105,6 +103,8 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev, for (xcp_id = 0; xcp_id < adev->xcp_mgr->num_xcps; xcp_id++) { if (adev->xcp_mgr->xcp[xcp_id].ip[ip_blk].inst_mask & inst_mask) { ring->xcp_id = xcp_id; + dev_dbg(adev->dev, "ring:%s xcp_id :%u", ring->name, + ring->xcp_id); if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) adev->gfx.enforce_isolation[xcp_id].xcp_id = xcp_id; break; @@ -394,38 +394,31 @@ static int __aqua_vanjaram_get_xcp_ip_info(struct amdgpu_xcp_mgr *xcp_mgr, int x struct amdgpu_xcp_ip *ip) { struct amdgpu_device *adev = xcp_mgr->adev; + int num_sdma, num_vcn, num_shared_vcn, num_xcp; int num_xcc_xcp, num_sdma_xcp, num_vcn_xcp; - int num_sdma, num_vcn; num_sdma = adev->sdma.num_instances; num_vcn = adev->vcn.num_vcn_inst; + num_shared_vcn = 1; + + num_xcc_xcp = adev->gfx.num_xcc_per_xcp; + num_xcp = NUM_XCC(adev->gfx.xcc_mask) / num_xcc_xcp; switch (xcp_mgr->mode) { case AMDGPU_SPX_PARTITION_MODE: - num_sdma_xcp = num_sdma; - num_vcn_xcp = num_vcn; - break; case AMDGPU_DPX_PARTITION_MODE: - num_sdma_xcp = num_sdma / 2; - num_vcn_xcp = num_vcn / 2; - break; case AMDGPU_TPX_PARTITION_MODE: - num_sdma_xcp = num_sdma / 3; - num_vcn_xcp = num_vcn / 3; - break; case AMDGPU_QPX_PARTITION_MODE: - num_sdma_xcp = num_sdma / 4; - num_vcn_xcp = num_vcn / 4; - break; case AMDGPU_CPX_PARTITION_MODE: - num_sdma_xcp = 2; - num_vcn_xcp = num_vcn ? 1 : 0; + num_sdma_xcp = DIV_ROUND_UP(num_sdma, num_xcp); + num_vcn_xcp = DIV_ROUND_UP(num_vcn, num_xcp); break; default: return -EINVAL; } - num_xcc_xcp = adev->gfx.num_xcc_per_xcp; + if (num_vcn && num_xcp > num_vcn) + num_shared_vcn = num_xcp / num_vcn; switch (ip_id) { case AMDGPU_XCP_GFXHUB: @@ -441,7 +434,8 @@ static int __aqua_vanjaram_get_xcp_ip_info(struct amdgpu_xcp_mgr *xcp_mgr, int x ip->ip_funcs = &sdma_v4_4_2_xcp_funcs; break; case AMDGPU_XCP_VCN: - ip->inst_mask = XCP_INST_MASK(num_vcn_xcp, xcp_id); + ip->inst_mask = + XCP_INST_MASK(num_vcn_xcp, xcp_id / num_shared_vcn); /* TODO : Assign IP funcs */ break; default: From 54b86443fd4437c051aefd3f462cfff4defd420c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 5 Jun 2024 16:26:22 +0200 Subject: [PATCH 234/352] drm/amdgpu: explicitely set the AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of having that in the amdgpu_bo_pin() function applied for all pinned BOs. 
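The pattern at the affected call sites becomes (sketch, error handling
unchanged):

    abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
    r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM);

so only buffers that really have to be physically contiguous (scanout,
cursor, writeback) request it, instead of every pinned BO getting the flag
implicitly.
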
Signed-off-by: Christian König Acked-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 1 + drivers/gpu/drm/amd/amdgpu/dce_v10_0.c | 2 ++ drivers/gpu/drm/amd/amdgpu/dce_v11_0.c | 2 ++ drivers/gpu/drm/amd/amdgpu/dce_v6_0.c | 2 ++ drivers/gpu/drm/amd/amdgpu/dce_v8_0.c | 2 ++ drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c | 1 + drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c | 1 + 9 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index 046d4c4e02990..b119d27271c1a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -233,6 +233,7 @@ int amdgpu_display_crtc_page_flip_target(struct drm_crtc *crtc, } if (!adev->enable_virtual_display) { + new_abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; r = amdgpu_bo_pin(new_abo, amdgpu_display_supported_domains(adev, new_abo->flags)); if (unlikely(r != 0)) { @@ -1759,6 +1760,7 @@ int amdgpu_display_resume_helper(struct amdgpu_device *adev) r = amdgpu_bo_reserve(aobj, true); if (r == 0) { + aobj->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); if (r != 0) dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index a987f671b1d53..d62df3b5a0149 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -942,7 +942,6 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, */ int amdgpu_bo_pin(struct amdgpu_bo *bo, u32 domain) { - bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; return amdgpu_bo_pin_restricted(bo, domain, 0, 0); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c index e5f508d34ed83..d4c2afafbb73c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c @@ -338,6 +338,7 @@ static int amdgpu_vkms_prepare_fb(struct drm_plane *plane, else domain = AMDGPU_GEM_DOMAIN_VRAM; + rbo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; r = amdgpu_bo_pin(rbo, domain); if (unlikely(r != 0)) { if (r != -ERESTARTSYS) diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c index 742adbc460c9d..70c1399f738de 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c @@ -1881,6 +1881,7 @@ static int dce_v10_0_crtc_do_set_base(struct drm_crtc *crtc, return r; if (!atomic) { + abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM); if (unlikely(r != 0)) { amdgpu_bo_unreserve(abo); @@ -2401,6 +2402,7 @@ static int dce_v10_0_crtc_cursor_set2(struct drm_crtc *crtc, return ret; } + aobj->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; ret = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); amdgpu_bo_unreserve(aobj); if (ret) { diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c index 8d46ebadfa466..f154c24499c8a 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c @@ -1931,6 +1931,7 @@ static int dce_v11_0_crtc_do_set_base(struct drm_crtc *crtc, return r; if (!atomic) { + abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM); if (unlikely(r != 0)) 
{ amdgpu_bo_unreserve(abo); @@ -2485,6 +2486,7 @@ static int dce_v11_0_crtc_cursor_set2(struct drm_crtc *crtc, return ret; } + aobj->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; ret = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); amdgpu_bo_unreserve(aobj); if (ret) { diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c index f08dc6a3886f1..a7fcb135827f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c @@ -1861,6 +1861,7 @@ static int dce_v6_0_crtc_do_set_base(struct drm_crtc *crtc, return r; if (!atomic) { + abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM); if (unlikely(r != 0)) { amdgpu_bo_unreserve(abo); @@ -2321,6 +2322,7 @@ static int dce_v6_0_crtc_cursor_set2(struct drm_crtc *crtc, return ret; } + aobj->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; ret = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); amdgpu_bo_unreserve(aobj); if (ret) { diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c index a6a3adf2ae134..77ac3f114d241 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c @@ -1828,6 +1828,7 @@ static int dce_v8_0_crtc_do_set_base(struct drm_crtc *crtc, return r; if (!atomic) { + abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM); if (unlikely(r != 0)) { amdgpu_bo_unreserve(abo); @@ -2320,6 +2321,7 @@ static int dce_v8_0_crtc_cursor_set2(struct drm_crtc *crtc, return ret; } + aobj->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; ret = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); amdgpu_bo_unreserve(aobj); if (ret) { diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c index 25f63b2e7a8e2..495e3cd70426d 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c @@ -961,6 +961,7 @@ static int amdgpu_dm_plane_helper_prepare_fb(struct drm_plane *plane, else domain = AMDGPU_GEM_DOMAIN_VRAM; + rbo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; r = amdgpu_bo_pin(rbo, domain); if (unlikely(r != 0)) { if (r != -ERESTARTSYS) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c index 08c494a7a21ba..0d5fefb0f5917 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c @@ -114,6 +114,7 @@ static int amdgpu_dm_wb_prepare_job(struct drm_writeback_connector *wb_connector domain = amdgpu_display_supported_domains(adev, rbo->flags); + rbo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; r = amdgpu_bo_pin(rbo, domain); if (unlikely(r != 0)) { if (r != -ERESTARTSYS) From f2be7b39e43893ab5361115de2b95e7c5c86f190 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 5 Jun 2024 16:34:49 +0200 Subject: [PATCH 235/352] drm/amdgpu: remove amdgpu_pin_restricted() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We haven't used the functionality to pin BOs in a certain range at all while the driver existed. Just nuke it. 
Signed-off-by: Christian König Acked-by: Lijo Lazar Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 56 ++----------------- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 2 - 3 files changed, 6 insertions(+), 54 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 4afef5b46c7d5..ce5ca304dba93 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1499,7 +1499,7 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) } } - ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); + ret = amdgpu_bo_pin(bo, domain); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index d62df3b5a0149..5bff4249b3573 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -809,29 +809,22 @@ void amdgpu_bo_unref(struct amdgpu_bo **bo) } /** - * amdgpu_bo_pin_restricted - pin an &amdgpu_bo buffer object + * amdgpu_bo_pin - pin an &amdgpu_bo buffer object * @bo: &amdgpu_bo buffer object to be pinned * @domain: domain to be pinned to - * @min_offset: the start of requested address range - * @max_offset: the end of requested address range * - * Pins the buffer object according to requested domain and address range. If - * the memory is unbound gart memory, binds the pages into gart table. Adjusts - * pin_count and pin_size accordingly. + * Pins the buffer object according to requested domain. If the memory is + * unbound gart memory, binds the pages into gart table. Adjusts pin_count and + * pin_size accordingly. * * Pinning means to lock pages in memory along with keeping them at a fixed * offset. It is required when a buffer can not be moved, for example, when * a display buffer is being scanned out. * - * Compared with amdgpu_bo_pin(), this function gives more flexibility on - * where to pin a buffer if there are specific restrictions on where a buffer - * must be located. - * * Returns: * 0 for success or a negative error code on failure. 
*/ -int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, - u64 min_offset, u64 max_offset) +int amdgpu_bo_pin(struct amdgpu_bo *bo, u32 domain) { struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct ttm_operation_ctx ctx = { false, false }; @@ -840,9 +833,6 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) return -EPERM; - if (WARN_ON_ONCE(min_offset > max_offset)) - return -EINVAL; - /* Check domain to be pinned to against preferred domains */ if (bo->preferred_domains & domain) domain = bo->preferred_domains & domain; @@ -868,14 +858,6 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, return -EINVAL; ttm_bo_pin(&bo->tbo); - - if (max_offset != 0) { - u64 domain_start = amdgpu_ttm_domain_start(adev, - mem_type); - WARN_ON_ONCE(max_offset < - (amdgpu_bo_gpu_offset(bo) - domain_start)); - } - return 0; } @@ -892,17 +874,6 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; amdgpu_bo_placement_from_domain(bo, domain); for (i = 0; i < bo->placement.num_placement; i++) { - unsigned int fpfn, lpfn; - - fpfn = min_offset >> PAGE_SHIFT; - lpfn = max_offset >> PAGE_SHIFT; - - if (fpfn > bo->placements[i].fpfn) - bo->placements[i].fpfn = fpfn; - if (!bo->placements[i].lpfn || - (lpfn && lpfn < bo->placements[i].lpfn)) - bo->placements[i].lpfn = lpfn; - if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && bo->placements[i].mem_type == TTM_PL_VRAM) bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; @@ -928,23 +899,6 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, return r; } -/** - * amdgpu_bo_pin - pin an &amdgpu_bo buffer object - * @bo: &amdgpu_bo buffer object to be pinned - * @domain: domain to be pinned to - * - * A simple wrapper to amdgpu_bo_pin_restricted(). - * Provides a simpler API for buffers that do not have any strict restrictions - * on where a buffer must be located. - * - * Returns: - * 0 for success or a negative error code on failure. - */ -int amdgpu_bo_pin(struct amdgpu_bo *bo, u32 domain) -{ - return amdgpu_bo_pin_restricted(bo, domain, 0, 0); -} - /** * amdgpu_bo_unpin - unpin an &amdgpu_bo buffer object * @bo: &amdgpu_bo buffer object to be unpinned diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index e42c34a3289d5..717e47b46167a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -304,8 +304,6 @@ void amdgpu_bo_kunmap(struct amdgpu_bo *bo); struct amdgpu_bo *amdgpu_bo_ref(struct amdgpu_bo *bo); void amdgpu_bo_unref(struct amdgpu_bo **bo); int amdgpu_bo_pin(struct amdgpu_bo *bo, u32 domain); -int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, - u64 min_offset, u64 max_offset); void amdgpu_bo_unpin(struct amdgpu_bo *bo); int amdgpu_bo_init(struct amdgpu_device *adev); void amdgpu_bo_fini(struct amdgpu_device *adev); From 375b035f689735fd7a87ff31ccac3a42717252bf Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 13 Sep 2024 16:22:01 -0400 Subject: [PATCH 236/352] drm/amdgpu/bios: split vbios fetching between APU and dGPU We need some different logic for dGPUs and the APU path can be simplified because there are some methods which are never used on APUs. This also fixes a regression on some older APUs causing the driver to fetch the unpatched ROM image rather than the patched image. 
Fixes: 9c081c11c621 ("drm/amdgpu: Reorder to read EFI exported ROM first") Reviewed-by: George Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c | 47 +++++++++++++++++++++++- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c index 42e64bce661e4..e8f62d718167b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c @@ -414,7 +414,36 @@ static inline bool amdgpu_acpi_vfct_bios(struct amdgpu_device *adev) } #endif -bool amdgpu_get_bios(struct amdgpu_device *adev) +static bool amdgpu_get_bios_apu(struct amdgpu_device *adev) +{ + if (amdgpu_acpi_vfct_bios(adev)) { + dev_info(adev->dev, "Fetched VBIOS from VFCT\n"); + goto success; + } + + if (igp_read_bios_from_vram(adev)) { + dev_info(adev->dev, "Fetched VBIOS from VRAM BAR\n"); + goto success; + } + + if (amdgpu_read_bios(adev)) { + dev_info(adev->dev, "Fetched VBIOS from ROM BAR\n"); + goto success; + } + + if (amdgpu_read_platform_bios(adev)) { + dev_info(adev->dev, "Fetched VBIOS from platform\n"); + goto success; + } + + dev_err(adev->dev, "Unable to locate a BIOS ROM\n"); + return false; + +success: + return true; +} + +static bool amdgpu_get_bios_dgpu(struct amdgpu_device *adev) { if (amdgpu_atrm_get_bios(adev)) { dev_info(adev->dev, "Fetched VBIOS from ATRM\n"); @@ -455,10 +484,24 @@ bool amdgpu_get_bios(struct amdgpu_device *adev) return false; success: - adev->is_atom_fw = adev->asic_type >= CHIP_VEGA10; return true; } +bool amdgpu_get_bios(struct amdgpu_device *adev) +{ + bool found; + + if (adev->flags & AMD_IS_APU) + found = amdgpu_get_bios_apu(adev); + else + found = amdgpu_get_bios_dgpu(adev); + + if (found) + adev->is_atom_fw = adev->asic_type >= CHIP_VEGA10; + + return found; +} + /* helper function for soc15 and onwards to read bios from rom */ bool amdgpu_soc15_read_bios_from_rom(struct amdgpu_device *adev, u8 *bios, u32 length_bytes) From e7d4e1438533abe448813bdc45691f9c230aa307 Mon Sep 17 00:00:00 2001 From: Tobias Jakobi Date: Mon, 16 Sep 2024 14:54:05 +0200 Subject: [PATCH 237/352] drm/amd/display: handle nulled pipe context in DCE110's set_drr() As set_drr() is called from IRQ context, it can happen that the pipe context has been nulled by dc_state_destruct(). Apply the same protection here that is already present for dcn35_set_drr() and dcn10_set_drr(). I.e. fetch the tg pointer first (to avoid a race with dc_state_destruct()), and then check the local copy before using it. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3142 Fixes: 06ad7e164256 ("drm/amd/display: Destroy DC context while keeping DML and DML2") Acked-by: Alex Deucher Signed-off-by: Tobias Jakobi Signed-off-by: Alex Deucher --- .../amd/display/dc/hwss/dce110/dce110_hwseq.c | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c index aa7479b318982..4fbed0298adfa 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c @@ -2096,13 +2096,20 @@ static void set_drr(struct pipe_ctx **pipe_ctx, * as well. 
*/ for (i = 0; i < num_pipes; i++) { - pipe_ctx[i]->stream_res.tg->funcs->set_drr( - pipe_ctx[i]->stream_res.tg, ¶ms); - - if (adjust.v_total_max != 0 && adjust.v_total_min != 0) - pipe_ctx[i]->stream_res.tg->funcs->set_static_screen_control( - pipe_ctx[i]->stream_res.tg, - event_triggers, num_frames); + /* dc_state_destruct() might null the stream resources, so fetch tg + * here first to avoid a race condition. The lifetime of the pointee + * itself (the timing_generator object) is not a problem here. + */ + struct timing_generator *tg = pipe_ctx[i]->stream_res.tg; + + if ((tg != NULL) && tg->funcs) { + if (tg->funcs->set_drr) + tg->funcs->set_drr(tg, ¶ms); + if (adjust.v_total_max != 0 && adjust.v_total_min != 0) + if (tg->funcs->set_static_screen_control) + tg->funcs->set_static_screen_control( + tg, event_triggers, num_frames); + } } } From 042658d17a54c9dc8c028986dfbde49f4aa01871 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 17 Sep 2024 08:53:21 -0400 Subject: [PATCH 238/352] drm/amdgpu: clean up vbios fetching code After splitting the logic between APU and dGPU, clean up some of the APU and dGPU specific logic that no longer applied. Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c index e8f62d718167b..46bf623919d7c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c @@ -284,10 +284,6 @@ static bool amdgpu_atrm_get_bios(struct amdgpu_device *adev) acpi_status status; bool found = false; - /* ATRM is for the discrete card only */ - if (adev->flags & AMD_IS_APU) - return false; - /* ATRM is for on-platform devices only */ if (dev_is_removable(&adev->pdev->dev)) return false; @@ -343,11 +339,8 @@ static inline bool amdgpu_atrm_get_bios(struct amdgpu_device *adev) static bool amdgpu_read_disabled_bios(struct amdgpu_device *adev) { - if (adev->flags & AMD_IS_APU) - return igp_read_bios_from_vram(adev); - else - return (!adev->asic_funcs || !adev->asic_funcs->read_disabled_bios) ? - false : amdgpu_asic_read_disabled_bios(adev); + return (!adev->asic_funcs || !adev->asic_funcs->read_disabled_bios) ? 
+ false : amdgpu_asic_read_disabled_bios(adev); } #ifdef CONFIG_ACPI @@ -455,11 +448,6 @@ static bool amdgpu_get_bios_dgpu(struct amdgpu_device *adev) goto success; } - if (igp_read_bios_from_vram(adev)) { - dev_info(adev->dev, "Fetched VBIOS from VRAM BAR\n"); - goto success; - } - if (amdgpu_read_platform_bios(adev)) { dev_info(adev->dev, "Fetched VBIOS from platform\n"); goto success; From 7b6df1d73290961ff0a00fd0022f28dd19e37181 Mon Sep 17 00:00:00 2001 From: Frank Min Date: Wed, 4 Sep 2024 10:50:33 +0800 Subject: [PATCH 239/352] drm/amdgpu: update golden regs for gfx12 update golden regs for gfx12 Signed-off-by: Frank Min Reviewed-by: Likun Gao Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index d1357c01eb391..47b47d21f4644 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -202,12 +202,16 @@ static const struct amdgpu_hwip_reg_entry gc_gfx_queue_reg_list_12[] = { SOC15_REG_ENTRY_STR(GC, 0, regCP_IB1_BUFSZ) }; -static const struct soc15_reg_golden golden_settings_gc_12_0[] = { +static const struct soc15_reg_golden golden_settings_gc_12_0_rev0[] = { SOC15_REG_GOLDEN_VALUE(GC, 0, regDB_MEM_CONFIG, 0x0000000f, 0x0000000f), SOC15_REG_GOLDEN_VALUE(GC, 0, regCB_HW_CONTROL_1, 0x03000000, 0x03000000), SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL5, 0x00000070, 0x00000020) }; +static const struct soc15_reg_golden golden_settings_gc_12_0[] = { + SOC15_REG_GOLDEN_VALUE(GC, 0, regDB_MEM_CONFIG, 0x00008000, 0x00008000), +}; + #define DEFAULT_SH_MEM_CONFIG \ ((SH_MEM_ADDRESS_MODE_64 << SH_MEM_CONFIG__ADDRESS_MODE__SHIFT) | \ (SH_MEM_ALIGNMENT_MODE_UNALIGNED << SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | \ @@ -3495,10 +3499,14 @@ static void gfx_v12_0_init_golden_registers(struct amdgpu_device *adev) switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): + soc15_program_register_sequence(adev, + golden_settings_gc_12_0, + (const u32)ARRAY_SIZE(golden_settings_gc_12_0)); + if (adev->rev_id == 0) soc15_program_register_sequence(adev, - golden_settings_gc_12_0, - (const u32)ARRAY_SIZE(golden_settings_gc_12_0)); + golden_settings_gc_12_0_rev0, + (const u32)ARRAY_SIZE(golden_settings_gc_12_0_rev0)); break; default: break; From d5a29e6a61028887bb8480e7c4af3547d6f3862d Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Tue, 17 Sep 2024 18:39:07 +0530 Subject: [PATCH 240/352] drm/amd/display: Fix kdoc entry for 'tps' in 'dc_process_dmub_dpia_set_tps_notification' Correct the parameter descriptor for the function `dc_process_dmub_dpia_set_tps_notification` to match the actual parameters used. 
Fixes the below with gcc W=1: drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc.c:5768: warning: Function parameter or struct member 'tps' not described in 'dc_process_dmub_dpia_set_tps_notification' drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc.c:5768: warning: Excess function parameter 'ts' description in 'dc_process_dmub_dpia_set_tps_notification' Cc: Tom Chung Cc: Rodrigo Siqueira Cc: Roman Li Cc: Alex Hung Cc: Aurabindo Pillai Cc: Harry Wentland Cc: Hamza Mahfooz Signed-off-by: Srinivasan Shanmugam Reviewed-by: Tom Chung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index a1652130e4be4..5c39390ecbd5c 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -5760,7 +5760,7 @@ enum dc_status dc_process_dmub_set_mst_slots(const struct dc *dc, * * @dc: [in] dc structure * @link_index: [in] link index - * @ts: [in] request tps + * @tps: [in] request tps * * Submits set_tps_notification command to dmub via inbox message */ From 87d749a6aab73d8069d0345afaa98297816cb220 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Sun, 15 Sep 2024 14:28:37 -0500 Subject: [PATCH 241/352] drm/amd/display: Allow backlight to go below `AMDGPU_DM_DEFAULT_MIN_BACKLIGHT` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The issue with panel power savings compatibility below `AMDGPU_DM_DEFAULT_MIN_BACKLIGHT` happens at `AMDGPU_DM_DEFAULT_MIN_BACKLIGHT` as well. That issue will be fixed separately, so don't prevent the backlight brightness from going that low. Cc: Harry Wentland Cc: Thomas Weißschuh Link: https://lore.kernel.org/amd-gfx/be04226a-a9e3-4a45-a83b-6d263c6557d8@t-8ch.de/T/#m400dee4e2fc61fe9470334d20a7c8c89c9aef44f Reviewed-by: Harry Wentland Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 94c8d9966ec38..6e79028c5d789 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -4506,7 +4506,7 @@ static void amdgpu_dm_update_backlight_caps(struct amdgpu_display_manager *dm, int spread = caps.max_input_signal - caps.min_input_signal; if (caps.max_input_signal > AMDGPU_DM_DEFAULT_MAX_BACKLIGHT || - caps.min_input_signal < AMDGPU_DM_DEFAULT_MIN_BACKLIGHT || + caps.min_input_signal < 0 || spread > AMDGPU_DM_DEFAULT_MAX_BACKLIGHT || spread < AMDGPU_DM_MIN_SPREAD) { DRM_DEBUG_KMS("DM: Invalid backlight caps: min=%d, max=%d\n", From 6dcba0975d39b30be65dd038fed69e1aa111c73a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 11 Jul 2024 14:39:43 +0200 Subject: [PATCH 242/352] drm/amdgpu: use GEM references instead of TTMs v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of a TTM reference grab a GEM reference whenever necessary. v2: fix typo in amdgpu_bo_unref pointed out by Vitaly, initialize the GEM funcs for kernel allocations as well. 
Signed-off-by: Christian König Reviewed-by: Daniel Vetter (v1) Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 13 +++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 8 +++----- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 0e617dff8765e..1a5df8b946616 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -43,8 +43,6 @@ #include "amdgpu_hmm.h" #include "amdgpu_xgmi.h" -static const struct drm_gem_object_funcs amdgpu_gem_object_funcs; - static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo = vmf->vma->vm_private_data; @@ -87,11 +85,11 @@ static const struct vm_operations_struct amdgpu_gem_vm_ops = { static void amdgpu_gem_object_free(struct drm_gem_object *gobj) { - struct amdgpu_bo *robj = gem_to_amdgpu_bo(gobj); + struct amdgpu_bo *aobj = gem_to_amdgpu_bo(gobj); - if (robj) { - amdgpu_hmm_unregister(robj); - amdgpu_bo_unref(&robj); + if (aobj) { + amdgpu_hmm_unregister(aobj); + ttm_bo_put(&aobj->tbo); } } @@ -126,7 +124,6 @@ int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size, bo = &ubo->bo; *obj = &bo->tbo.base; - (*obj)->funcs = &amdgpu_gem_object_funcs; return 0; } @@ -295,7 +292,7 @@ static int amdgpu_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_str return drm_gem_ttm_mmap(obj, vma); } -static const struct drm_gem_object_funcs amdgpu_gem_object_funcs = { +const struct drm_gem_object_funcs amdgpu_gem_object_funcs = { .free = amdgpu_gem_object_free, .open = amdgpu_gem_object_open, .close = amdgpu_gem_object_close, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h index f30264782ba27..3a8f57900a3aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h @@ -33,6 +33,8 @@ #define AMDGPU_GEM_DOMAIN_MAX 0x3 #define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_bo, tbo.base) +extern const struct drm_gem_object_funcs amdgpu_gem_object_funcs; + unsigned long amdgpu_gem_timeout(uint64_t timeout_ns); /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 5bff4249b3573..44819cdba7fbe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -564,6 +564,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, if (bo == NULL) return -ENOMEM; drm_gem_private_object_init(adev_to_drm(adev), &bo->tbo.base, size); + bo->tbo.base.funcs = &amdgpu_gem_object_funcs; bo->vm_bo = NULL; bo->preferred_domains = bp->preferred_domain ? 
bp->preferred_domain : bp->domain; @@ -786,7 +787,7 @@ struct amdgpu_bo *amdgpu_bo_ref(struct amdgpu_bo *bo) if (bo == NULL) return NULL; - ttm_bo_get(&bo->tbo); + drm_gem_object_get(&bo->tbo.base); return bo; } @@ -798,13 +799,10 @@ struct amdgpu_bo *amdgpu_bo_ref(struct amdgpu_bo *bo) */ void amdgpu_bo_unref(struct amdgpu_bo **bo) { - struct ttm_buffer_object *tbo; - if ((*bo) == NULL) return; - tbo = &((*bo)->tbo); - ttm_bo_put(tbo); + drm_gem_object_put(&(*bo)->tbo.base); *bo = NULL; } From 856265caa94a3c78feaa23ec1acd799fe1989201 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 16 Sep 2024 10:52:24 -0400 Subject: [PATCH 243/352] drm/amdgpu/mes11: reduce timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The firmware timeout is 2s. Reduce the driver timeout to 2.1 seconds to avoid back pressure on queue submissions. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3627 Fixes: f7c161a4c250 ("drm/amdgpu: increase mes submission timeout") Acked-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index ee91ff9e52a20..231a3d490ea8e 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -161,7 +161,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, int api_status_off) { union MESAPI__QUERY_MES_STATUS mes_status_pkt; - signed long timeout = 3000000; /* 3000 ms */ + signed long timeout = 2100000; /* 2100 ms */ struct amdgpu_device *adev = mes->adev; struct amdgpu_ring *ring = &mes->ring[0]; struct MES_API_STATUS *api_status; From 84f76408abe989809de19d02e476b044fd985adc Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 18 Sep 2024 09:37:31 -0400 Subject: [PATCH 244/352] drm/amdgpu/mes12: reduce timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The firmware timeout is 2s. Reduce the driver timeout to 2.1 seconds to avoid back pressure on queue submissions. 
Fixes: 94b51a3d01ed ("drm/amdgpu/mes12: increase mes submission timeout") Acked-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index ef05a41162306..186f778133974 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -146,7 +146,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, int api_status_off) { union MESAPI__QUERY_MES_STATUS mes_status_pkt; - signed long timeout = 3000000; /* 3000 ms */ + signed long timeout = 2100000; /* 2100 ms */ struct amdgpu_device *adev = mes->adev; struct amdgpu_ring *ring = &mes->ring[pipe]; spinlock_t *ring_lock = &mes->ring_lock[pipe]; From 289ebd9afeb94862d96c89217068943f1937df5b Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 19 Sep 2024 09:22:57 +0900 Subject: [PATCH 245/352] ksmbd: fix warning: comparison of distinct pointer types lacks a cast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit smb2pdu.c: In function ‘smb2_open’: ./include/linux/minmax.h:20:28: warning: comparison of distinct pointer types lacks a cast 20 | (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) | ^~ ./include/linux/minmax.h:26:4: note: in expansion of macro ‘__typecheck’ 26 | (__typecheck(x, y) && __no_side_effects(x, y)) | ^~~~~~~~~~~ ./include/linux/minmax.h:36:24: note: in expansion of macro ‘__safe_cmp’ 36 | __builtin_choose_expr(__safe_cmp(x, y), \ | ^~~~~~~~~~ ./include/linux/minmax.h:45:19: note: in expansion of macro ‘__careful_cmp’ 45 | #define min(x, y) __careful_cmp(x, y, <) | ^~~~~~~~~~~~~ /home/linkinjeon/git/smbd_work/ksmbd/smb2pdu.c:3713:27: note: in expansion of macro ‘min’ 3713 | fp->durable_timeout = min(dh_info.timeout, Fixes: c8efcc786146 ("ksmbd: add support for durable handles v1/v2") Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 5 +++-- fs/smb/server/vfs_cache.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index e6bdc1b207272..28cd66fa8e05f 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -3531,8 +3531,9 @@ int smb2_open(struct ksmbd_work *work) memcpy(fp->create_guid, dh_info.CreateGuid, SMB2_CREATE_GUID_SIZE); if (dh_info.timeout) - fp->durable_timeout = min(dh_info.timeout, - DURABLE_HANDLE_MAX_TIMEOUT); + fp->durable_timeout = + min_t(unsigned int, dh_info.timeout, + DURABLE_HANDLE_MAX_TIMEOUT); else fp->durable_timeout = 60; } diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index b0f6d0f94cb8d..5bbb179736c29 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -100,8 +100,8 @@ struct ksmbd_file { struct list_head blocked_works; struct list_head lock_list; - int durable_timeout; - int durable_scavenger_timeout; + unsigned int durable_timeout; + unsigned int durable_scavenger_timeout; /* if ls is happening on directory, below is valid*/ struct ksmbd_readdir_data readdir_data; From 8bb04fb2b7e346ffcf00164b91129ab151ceaa46 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 20 Aug 2024 21:15:20 +0200 Subject: [PATCH 246/352] ksmbd: Replace one-element arrays with flexible-array members Replace the deprecated one-element arrays with flexible-array members in the structs copychunk_ioctl_req and smb2_ea_info_req. 
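The related request-length checks in the diff move from '<' to '<=' because sizeof() counts the trailing one-element array but not a flexible-array member; a minimal stand-alone sketch (hypothetical struct names and plain C types instead of the __le32/__u8 kernel types):

    #include <stdio.h>

    struct ea_req_old { unsigned int next; unsigned char len; char name[1]; } __attribute__((packed));
    struct ea_req_new { unsigned int next; unsigned char len; char name[];  } __attribute__((packed));

    int main(void)
    {
        /* prints "6 5" on a typical build: the [1] element is counted by
         * sizeof(), the flexible array is not, so a minimum-length test
         * written as "x < 6" becomes "x <= 5" after the conversion */
        printf("%zu %zu\n", sizeof(struct ea_req_old), sizeof(struct ea_req_new));
        return 0;
    }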
There are no binary differences after this conversion. Link: https://github.com/KSPP/linux/issues/79 Signed-off-by: Thorsten Blum Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 4 ++-- fs/smb/server/smb2pdu.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 28cd66fa8e05f..7121266daa020 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4587,7 +4587,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, path = &fp->filp->f_path; /* single EA entry is requested with given user.* name */ if (req->InputBufferLength) { - if (le32_to_cpu(req->InputBufferLength) < + if (le32_to_cpu(req->InputBufferLength) <= sizeof(struct smb2_ea_info_req)) return -EINVAL; @@ -8091,7 +8091,7 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } - if (in_buf_len < sizeof(struct copychunk_ioctl_req)) { + if (in_buf_len <= sizeof(struct copychunk_ioctl_req)) { ret = -EINVAL; goto out; } diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 3be7d5ae65a80..73aff20e22d01 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -194,7 +194,7 @@ struct copychunk_ioctl_req { __le64 ResumeKey[3]; __le32 ChunkCount; __le32 Reserved; - __u8 Chunks[1]; /* array of srv_copychunk */ + __u8 Chunks[]; /* array of srv_copychunk */ } __packed; struct srv_copychunk { @@ -370,7 +370,7 @@ struct smb2_file_attr_tag_info { struct smb2_ea_info_req { __le32 NextEntryOffset; __u8 EaNameLength; - char name[1]; + char name[]; } __packed; /* level 15 Query */ struct smb2_ea_info { From 39c3aad43f6f9bcddd660f5874dcd760e8c04a94 Mon Sep 17 00:00:00 2001 From: Ahmed Ehab Date: Sun, 22 Sep 2024 00:00:36 +0300 Subject: [PATCH 247/352] bcachefs: Hold read lock in bch2_snapshot_tree_oldest_subvol() Syzbot reports a problem that a warning is triggered due to suspicious use of rcu_dereference_check(). That is triggered by a call of bch2_snapshot_tree_oldest_subvol(). The cause of the warning is that inside bch2_snapshot_tree_oldest_subvol(), snapshot_t() is called which calls rcu_dereference() that requires a read lock to be held. Also, the call of bch2_snapshot_tree_next() eventually calls snapshot_t(). To fix this, call rcu_read_lock() before calling snapshot_t(). Then, release the lock after the termination of the while loop. Reported-by: Signed-off-by: Ahmed Ehab Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 8b18a9b483a4d..dff83ebbd912c 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -469,6 +469,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) u32 id = snapshot_root; u32 subvol = 0, s; + rcu_read_lock(); while (id) { s = snapshot_t(c, id)->subvol; @@ -477,6 +478,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) id = bch2_snapshot_tree_next(c, id); } + rcu_read_unlock(); return subvol; } From 0e4ed48292c55eeb0afab22f8930b556f17eaad2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 29 Aug 2024 15:58:53 +0200 Subject: [PATCH 248/352] mailbox: ARM_MHU_V3 should depend on ARM64 The ARM MHUv3 controller is only present on ARM64 SoCs. Hence add a dependency on ARM64, to prevent asking the user about this driver when configuring a kernel for a different architecture than ARM64. 
Fixes: ca1a8680b134b5e6 ("mailbox: arm_mhuv3: Add driver") Signed-off-by: Geert Uytterhoeven Acked-by: Sudeep Holla Signed-off-by: Jassi Brar --- drivers/mailbox/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig index 4eed972959279..cbd9206cd7de3 100644 --- a/drivers/mailbox/Kconfig +++ b/drivers/mailbox/Kconfig @@ -25,6 +25,7 @@ config ARM_MHU_V2 config ARM_MHU_V3 tristate "ARM MHUv3 Mailbox" + depends on ARM64 || COMPILE_TEST depends on HAS_IOMEM || COMPILE_TEST depends on OF help From 39d7d6177f0cc25a567a4b3d2b2323489d4615f7 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 29 Jul 2024 15:47:09 +0800 Subject: [PATCH 249/352] mailbox: imx: use device name in interrupt name There are several MUs for different usage, SCMI MU, ELE MU, RemotePROC MU. Using "imx_mu_chan" in interrupt name would be hard to identify which MU triggers interrupt, so use device name to make it easy to know which MU triggers which interrupt. Signed-off-by: Peng Fan Signed-off-by: Jassi Brar --- drivers/mailbox/imx-mailbox.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mailbox/imx-mailbox.c b/drivers/mailbox/imx-mailbox.c index d17efb1dd0cb1..f815dab3be50c 100644 --- a/drivers/mailbox/imx-mailbox.c +++ b/drivers/mailbox/imx-mailbox.c @@ -30,7 +30,7 @@ #define IMX_MU_SCU_CHANS 6 /* TX0/RX0 */ #define IMX_MU_S4_CHANS 2 -#define IMX_MU_CHAN_NAME_SIZE 20 +#define IMX_MU_CHAN_NAME_SIZE 32 #define IMX_MU_V2_PAR_OFF 0x4 #define IMX_MU_V2_TR_MASK GENMASK(7, 0) @@ -782,7 +782,7 @@ static int imx_mu_init_generic(struct imx_mu_priv *priv) cp->chan = &priv->mbox_chans[i]; priv->mbox_chans[i].con_priv = cp; snprintf(cp->irq_desc, sizeof(cp->irq_desc), - "imx_mu_chan[%i-%i]", cp->type, cp->idx); + "%s[%i-%i]", dev_name(priv->dev), cp->type, cp->idx); } priv->mbox.num_chans = IMX_MU_CHANS; @@ -819,7 +819,7 @@ static int imx_mu_init_specific(struct imx_mu_priv *priv) cp->chan = &priv->mbox_chans[i]; priv->mbox_chans[i].con_priv = cp; snprintf(cp->irq_desc, sizeof(cp->irq_desc), - "imx_mu_chan[%i-%i]", cp->type, cp->idx); + "%s[%i-%i]", dev_name(priv->dev), cp->type, cp->idx); } priv->mbox.num_chans = num_chans; From e92d87c9c5d769e4cb1dd7c90faa38dddd7e52e3 Mon Sep 17 00:00:00 2001 From: Liao Chen Date: Wed, 14 Aug 2024 02:51:47 +0000 Subject: [PATCH 250/352] mailbox: rockchip: fix a typo in module autoloading MODULE_DEVICE_TABLE(of, rockchip_mbox_of_match) could let the module properly autoloaded based on the alias from of_device_id table. It should be 'rockchip_mbox_of_match' instead of 'rockchp_mbox_of_match', just fix it. 
Fixes: f70ed3b5dc8b ("mailbox: rockchip: Add Rockchip mailbox driver") Signed-off-by: Liao Chen Reviewed-by: Heiko Stuebner Signed-off-by: Jassi Brar --- drivers/mailbox/rockchip-mailbox.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mailbox/rockchip-mailbox.c b/drivers/mailbox/rockchip-mailbox.c index 8ffad059e8984..4d966cb2ed036 100644 --- a/drivers/mailbox/rockchip-mailbox.c +++ b/drivers/mailbox/rockchip-mailbox.c @@ -159,7 +159,7 @@ static const struct of_device_id rockchip_mbox_of_match[] = { { .compatible = "rockchip,rk3368-mailbox", .data = &rk3368_drv_data}, { }, }; -MODULE_DEVICE_TABLE(of, rockchp_mbox_of_match); +MODULE_DEVICE_TABLE(of, rockchip_mbox_of_match); static int rockchip_mbox_probe(struct platform_device *pdev) { From 0d97651b7577148242571b8692aae4a8b9ee0979 Mon Sep 17 00:00:00 2001 From: Huan Yang Date: Thu, 22 Aug 2024 09:59:55 +0800 Subject: [PATCH 251/352] mailbox: sprd: Use devm_clk_get_enabled() helpers The devm_clk_get_enabled() helpers: - call devm_clk_get() - call clk_prepare_enable() and register what is needed in order to call clk_disable_unprepare() when needed, as a managed resource. This simplifies the code and avoids the calls to clk_disable_unprepare(). Due to clk only used in probe, not in suspend\resume, this pointer can remove from sprd_mbox_priv to save a little memory. Signed-off-by: Huan Yang Reviewed-by: Christophe JAILLET Reviewed-by: Baolin Wang Signed-off-by: Jassi Brar --- drivers/mailbox/sprd-mailbox.c | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/drivers/mailbox/sprd-mailbox.c b/drivers/mailbox/sprd-mailbox.c index 9ae57de77d4d7..ee8539dfcef54 100644 --- a/drivers/mailbox/sprd-mailbox.c +++ b/drivers/mailbox/sprd-mailbox.c @@ -62,7 +62,6 @@ struct sprd_mbox_priv { void __iomem *outbox_base; /* Base register address for supplementary outbox */ void __iomem *supp_base; - struct clk *clk; u32 outbox_fifo_depth; struct mutex lock; @@ -291,19 +290,13 @@ static const struct mbox_chan_ops sprd_mbox_ops = { .shutdown = sprd_mbox_shutdown, }; -static void sprd_mbox_disable(void *data) -{ - struct sprd_mbox_priv *priv = data; - - clk_disable_unprepare(priv->clk); -} - static int sprd_mbox_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct sprd_mbox_priv *priv; int ret, inbox_irq, outbox_irq, supp_irq; unsigned long id, supp; + struct clk *clk; priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) @@ -331,20 +324,10 @@ static int sprd_mbox_probe(struct platform_device *pdev) if (IS_ERR(priv->outbox_base)) return PTR_ERR(priv->outbox_base); - priv->clk = devm_clk_get(dev, "enable"); - if (IS_ERR(priv->clk)) { + clk = devm_clk_get_enabled(dev, "enable"); + if (IS_ERR(clk)) { dev_err(dev, "failed to get mailbox clock\n"); - return PTR_ERR(priv->clk); - } - - ret = clk_prepare_enable(priv->clk); - if (ret) - return ret; - - ret = devm_add_action_or_reset(dev, sprd_mbox_disable, priv); - if (ret) { - dev_err(dev, "failed to add mailbox disable action\n"); - return ret; + return PTR_ERR(clk); } inbox_irq = platform_get_irq_byname(pdev, "inbox"); From dc09f007caed3b2f6a3b6bd7e13777557ae22bfd Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Wed, 21 Aug 2024 23:40:44 +0200 Subject: [PATCH 252/352] mailbox: bcm2835: Fix timeout during suspend mode During noirq suspend phase the Raspberry Pi power driver suffer of firmware property timeouts. 
The reason is that the IRQ of the underlying BCM2835 mailbox is disabled and rpi_firmware_property_list() will always run into a timeout [1]. Since the VideoCore side isn't consider as a wakeup source, set the IRQF_NO_SUSPEND flag for the mailbox IRQ in order to keep it enabled during suspend-resume cycle. [1] PM: late suspend of devices complete after 1.754 msecs WARNING: CPU: 0 PID: 438 at drivers/firmware/raspberrypi.c:128 rpi_firmware_property_list+0x204/0x22c Firmware transaction 0x00028001 timeout Modules linked in: CPU: 0 PID: 438 Comm: bash Tainted: G C 6.9.3-dirty #17 Hardware name: BCM2835 Call trace: unwind_backtrace from show_stack+0x18/0x1c show_stack from dump_stack_lvl+0x34/0x44 dump_stack_lvl from __warn+0x88/0xec __warn from warn_slowpath_fmt+0x7c/0xb0 warn_slowpath_fmt from rpi_firmware_property_list+0x204/0x22c rpi_firmware_property_list from rpi_firmware_property+0x68/0x8c rpi_firmware_property from rpi_firmware_set_power+0x54/0xc0 rpi_firmware_set_power from _genpd_power_off+0xe4/0x148 _genpd_power_off from genpd_sync_power_off+0x7c/0x11c genpd_sync_power_off from genpd_finish_suspend+0xcc/0xe0 genpd_finish_suspend from dpm_run_callback+0x78/0xd0 dpm_run_callback from device_suspend_noirq+0xc0/0x238 device_suspend_noirq from dpm_suspend_noirq+0xb0/0x168 dpm_suspend_noirq from suspend_devices_and_enter+0x1b8/0x5ac suspend_devices_and_enter from pm_suspend+0x254/0x2e4 pm_suspend from state_store+0xa8/0xd4 state_store from kernfs_fop_write_iter+0x154/0x1a0 kernfs_fop_write_iter from vfs_write+0x12c/0x184 vfs_write from ksys_write+0x78/0xc0 ksys_write from ret_fast_syscall+0x0/0x54 Exception stack(0xcc93dfa8 to 0xcc93dff0) [...] PM: noirq suspend of devices complete after 3095.584 msecs Link: https://github.com/raspberrypi/firmware/issues/1894 Fixes: 0bae6af6d704 ("mailbox: Enable BCM2835 mailbox support") Signed-off-by: Stefan Wahren Reviewed-by: Florian Fainelli Signed-off-by: Jassi Brar --- drivers/mailbox/bcm2835-mailbox.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mailbox/bcm2835-mailbox.c b/drivers/mailbox/bcm2835-mailbox.c index fbfd0202047c3..ea12fb8d24015 100644 --- a/drivers/mailbox/bcm2835-mailbox.c +++ b/drivers/mailbox/bcm2835-mailbox.c @@ -145,7 +145,8 @@ static int bcm2835_mbox_probe(struct platform_device *pdev) spin_lock_init(&mbox->lock); ret = devm_request_irq(dev, irq_of_parse_and_map(dev->of_node, 0), - bcm2835_mbox_irq, 0, dev_name(dev), mbox); + bcm2835_mbox_irq, IRQF_NO_SUSPEND, dev_name(dev), + mbox); if (ret) { dev_err(dev, "Failed to register a mailbox IRQ handler: %d\n", ret); From 263dbd3cc88da7ea7413494eea66418b4f1b2e6d Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 14:16:08 -0600 Subject: [PATCH 253/352] mailbox: Use of_property_match_string() instead of open-coding Use of_property_match_string() instead of open-coding the search. With this, of_get_property() can be removed as there is no need to check for "mbox-names" presence first. This is part of a larger effort to remove callers of of_get_property() and similar functions. of_get_property() leaks the DT property data pointer which is a problem for dynamically allocated nodes which may be freed. 
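The resulting lookup also collapses the two error cases (property missing, name not present) into the single negative return value of of_property_match_string(); condensed from the hunk below:

    index = of_property_match_string(np, "mbox-names", name);
    if (index < 0) {
        dev_err(cl->dev, "%s() could not locate channel named \"%s\"\n",
                __func__, name);
        return ERR_PTR(-EINVAL);
    }
    return mbox_request_channel(cl, index);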
Signed-off-by: Rob Herring (Arm) Signed-off-by: Jassi Brar --- drivers/mailbox/mailbox.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index ebff3baf30451..d3d26a2c98956 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -450,30 +450,20 @@ struct mbox_chan *mbox_request_channel_byname(struct mbox_client *cl, const char *name) { struct device_node *np = cl->dev->of_node; - struct property *prop; - const char *mbox_name; - int index = 0; + int index; if (!np) { dev_err(cl->dev, "%s() currently only supports DT\n", __func__); return ERR_PTR(-EINVAL); } - if (!of_get_property(np, "mbox-names", NULL)) { - dev_err(cl->dev, - "%s() requires an \"mbox-names\" property\n", __func__); + index = of_property_match_string(np, "mbox-names", name); + if (index < 0) { + dev_err(cl->dev, "%s() could not locate channel named \"%s\"\n", + __func__, name); return ERR_PTR(-EINVAL); } - - of_property_for_each_string(np, "mbox-names", prop, mbox_name) { - if (!strncmp(name, mbox_name, strlen(name))) - return mbox_request_channel(cl, index); - index++; - } - - dev_err(cl->dev, "%s() could not locate channel named \"%s\"\n", - __func__, name); - return ERR_PTR(-EINVAL); + return mbox_request_channel(cl, index); } EXPORT_SYMBOL_GPL(mbox_request_channel_byname); From c13c196d5e5c96356386a8099e14690e393a5148 Mon Sep 17 00:00:00 2001 From: Fei Shao Date: Wed, 11 Sep 2024 22:33:55 +0800 Subject: [PATCH 254/352] dt-bindings: mailbox: mtk,adsp-mbox: Add compatible for MT8188 Add compatible string for ADSP mailbox on MT8188 SoC, which is compatible with the one used on MT8186. Acked-by: Rob Herring (Arm) Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Fei Shao Signed-off-by: Jassi Brar --- .../devicetree/bindings/mailbox/mtk,adsp-mbox.yaml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/mailbox/mtk,adsp-mbox.yaml b/Documentation/devicetree/bindings/mailbox/mtk,adsp-mbox.yaml index 72c1d9e82c897..8a1369df4ecb2 100644 --- a/Documentation/devicetree/bindings/mailbox/mtk,adsp-mbox.yaml +++ b/Documentation/devicetree/bindings/mailbox/mtk,adsp-mbox.yaml @@ -17,9 +17,15 @@ description: | properties: compatible: - enum: - - mediatek,mt8195-adsp-mbox - - mediatek,mt8186-adsp-mbox + oneOf: + - enum: + - mediatek,mt8186-adsp-mbox + - mediatek,mt8195-adsp-mbox + - items: + - enum: + - mediatek,mt8188-adsp-mbox + - const: mediatek,mt8186-adsp-mbox + "#mbox-cells": const: 0 From 4116ab5e8a48db72ca366ac042641dea8cf92f67 Mon Sep 17 00:00:00 2001 From: Nikunj Kela Date: Thu, 5 Sep 2024 11:47:36 -0700 Subject: [PATCH 255/352] dt-bindings: mailbox: qcom-ipcc: document the support for SA8255p Add a compatible for the ipcc on SA8255p platforms. 
Reviewed-by: Krzysztof Kozlowski Signed-off-by: Nikunj Kela Signed-off-by: Jassi Brar --- Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml b/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml index 05e4e1d51713a..bc108b8db9f48 100644 --- a/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml +++ b/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml @@ -25,6 +25,7 @@ properties: items: - enum: - qcom,qdu1000-ipcc + - qcom,sa8255p-ipcc - qcom,sa8775p-ipcc - qcom,sc7280-ipcc - qcom,sc8280xp-ipcc From 5232544ea368b54b517dc504308c9e62bc6e87eb Mon Sep 17 00:00:00 2001 From: Jingyi Wang Date: Wed, 11 Sep 2024 15:25:15 +0800 Subject: [PATCH 256/352] dt-bindings: mailbox: qcom-ipcc: Document QCS8300 IPCC Document the Inter-Processor Communication Controller on the Qualcomm QCS8300 Platform, which will be used to route interrupts across various subsystems found on the SoC. Signed-off-by: Jingyi Wang Reviewed-by: Krzysztof Kozlowski Signed-off-by: Jassi Brar --- Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml b/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml index bc108b8db9f48..2d66770ed3612 100644 --- a/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml +++ b/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml @@ -24,6 +24,7 @@ properties: compatible: items: - enum: + - qcom,qcs8300-ipcc - qcom,qdu1000-ipcc - qcom,sa8255p-ipcc - qcom,sa8775p-ipcc From 8e391ae0607fc820e9ece87d1415e6a1ff274e69 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 19 Sep 2024 14:40:04 +0200 Subject: [PATCH 257/352] s390/vdso: Get rid of permutation constants The three byte masks for VECTOR PERMUTE are not needed, since the instruction VECTOR SHIFT LEFT DOUBLE BY BYTE can be used to implement the required "rotate left" easily. 
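VECTOR SHIFT LEFT DOUBLE BY BYTE with both source operands set to the same register is simply a byte-wise rotate left, so VSLDB V,V,V,4/8/12 reproduces what the three VPERM masks selected; a small C model of the operation, assuming byte 0 is the leftmost vector byte as on s390:

    #include <string.h>

    /* rotate a 16-byte vector register image left by n bytes,
     * i.e. what VSLDB dst,src,src,n computes */
    static void vsldb_self(unsigned char v[16], unsigned int n)
    {
        unsigned char tmp[16];

        for (unsigned int i = 0; i < 16; i++)
            tmp[i] = v[(i + n) % 16];
        memcpy(v, tmp, 16);
    }

With n = 4 the four 32-bit state words move from [0,1,2,3] to [1,2,3,0], matching the comments in the diff.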
Signed-off-by: Heiko Carstens Reviewed-by: Jens Remus Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso64/vgetrandom-chacha.S | 25 +++++++++------------ 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/arch/s390/kernel/vdso64/vgetrandom-chacha.S b/arch/s390/kernel/vdso64/vgetrandom-chacha.S index d802b0a96f414..475711e0c6a36 100644 --- a/arch/s390/kernel/vdso64/vgetrandom-chacha.S +++ b/arch/s390/kernel/vdso64/vgetrandom-chacha.S @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include #include #include #include @@ -12,9 +13,6 @@ #define COPY1 %v5 #define COPY2 %v6 #define COPY3 %v7 -#define PERM4 %v16 -#define PERM8 %v17 -#define PERM12 %v18 #define BEPERM %v19 #define TMP0 %v20 #define TMP1 %v21 @@ -23,12 +21,9 @@ .section .rodata - .balign 128 + .balign 32 .Lconstants: .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral - .long 0x04050607,0x08090a0b,0x0c0d0e0f,0x00010203 # rotl 4 bytes - .long 0x08090a0b,0x0c0d0e0f,0x00010203,0x04050607 # rotl 8 bytes - .long 0x0c0d0e0f,0x00010203,0x04050607,0x08090a0b # rotl 12 bytes .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap .text @@ -48,8 +43,8 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) /* COPY0 = "expand 32-byte k" */ VL COPY0,0,,%r1 - /* PERM4-PERM12,BEPERM = byte selectors for VPERM */ - VLM PERM4,BEPERM,16,%r1 + /* BEPERM = byte selectors for VPERM */ + ALTERNATIVE __stringify(VL BEPERM,16,,%r1), "brcl 0,0", ALT_FACILITY(148) /* COPY1,COPY2 = key */ VLM COPY1,COPY2,0,%r3 @@ -89,11 +84,11 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) VERLLF STATE1,STATE1,7 /* STATE1[0,1,2,3] = STATE1[1,2,3,0] */ - VPERM STATE1,STATE1,STATE1,PERM4 + VSLDB STATE1,STATE1,STATE1,4 /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ - VPERM STATE2,STATE2,STATE2,PERM8 + VSLDB STATE2,STATE2,STATE2,8 /* STATE3[0,1,2,3] = STATE3[3,0,1,2] */ - VPERM STATE3,STATE3,STATE3,PERM12 + VSLDB STATE3,STATE3,STATE3,12 /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */ VAF STATE0,STATE0,STATE1 @@ -116,11 +111,11 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) VERLLF STATE1,STATE1,7 /* STATE1[0,1,2,3] = STATE1[3,0,1,2] */ - VPERM STATE1,STATE1,STATE1,PERM12 + VSLDB STATE1,STATE1,STATE1,12 /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ - VPERM STATE2,STATE2,STATE2,PERM8 + VSLDB STATE2,STATE2,STATE2,8 /* STATE3[0,1,2,3] = STATE3[1,2,3,0] */ - VPERM STATE3,STATE3,STATE3,PERM4 + VSLDB STATE3,STATE3,STATE3,4 brctg %r0,.Ldoubleround /* OUTPUT0 = STATE0 + STATE0 */ From ff35a3f0ca5c88145c6da6630f3420071dfa296c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 19 Sep 2024 14:40:05 +0200 Subject: [PATCH 258/352] s390/vdso: Fix comment within __arch_chacha20_blocks_nostack() Fix comment within __arch_chacha20_blocks_nostack() so the comment reflects what the code is doing. 
Signed-off-by: Heiko Carstens Reviewed-by: Jens Remus Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso64/vgetrandom-chacha.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/s390/kernel/vdso64/vgetrandom-chacha.S b/arch/s390/kernel/vdso64/vgetrandom-chacha.S index 475711e0c6a36..3f7e30886d1b4 100644 --- a/arch/s390/kernel/vdso64/vgetrandom-chacha.S +++ b/arch/s390/kernel/vdso64/vgetrandom-chacha.S @@ -118,13 +118,13 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) VSLDB STATE3,STATE3,STATE3,4 brctg %r0,.Ldoubleround - /* OUTPUT0 = STATE0 + STATE0 */ + /* OUTPUT0 = STATE0 + COPY0 */ VAF STATE0,STATE0,COPY0 - /* OUTPUT1 = STATE1 + STATE1 */ + /* OUTPUT1 = STATE1 + COPY1 */ VAF STATE1,STATE1,COPY1 - /* OUTPUT2 = STATE2 + STATE2 */ + /* OUTPUT2 = STATE2 + COPY2 */ VAF STATE2,STATE2,COPY2 - /* OUTPUT2 = STATE3 + STATE3 */ + /* OUTPUT3 = STATE3 + COPY3 */ VAF STATE3,STATE3,COPY3 /* From 5cccfc8be6d256e91d155313edef20c1a89064b2 Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Thu, 19 Sep 2024 14:40:06 +0200 Subject: [PATCH 259/352] s390/vdso: Add CFI annotations to __arch_chacha20_blocks_nostack() This allows proper unwinding, for instance when using a debugger such as GDB. Signed-off-by: Jens Remus Signed-off-by: Heiko Carstens Acked-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso64/vgetrandom-chacha.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/kernel/vdso64/vgetrandom-chacha.S b/arch/s390/kernel/vdso64/vgetrandom-chacha.S index 3f7e30886d1b4..894954bf3b419 100644 --- a/arch/s390/kernel/vdso64/vgetrandom-chacha.S +++ b/arch/s390/kernel/vdso64/vgetrandom-chacha.S @@ -3,6 +3,7 @@ #include #include #include +#include #include #define STATE0 %v0 @@ -38,6 +39,7 @@ * size_t nblocks) */ SYM_FUNC_START(__arch_chacha20_blocks_nostack) + CFI_STARTPROC larl %r1,.Lconstants /* COPY0 = "expand 32-byte k" */ @@ -177,4 +179,5 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) VPERM TMP3,STATE3,STATE3,BEPERM VSTM TMP0,TMP3,0,%r2 j .Lstoredone + CFI_ENDPROC SYM_FUNC_END(__arch_chacha20_blocks_nostack) From d361390d9f2a332b52458cc69f1fe8e76d6c2943 Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Thu, 19 Sep 2024 14:40:07 +0200 Subject: [PATCH 260/352] s390/vdso: Use macros for annotation of asm functions Use the macros SYM_FUNC_START() and SYM_FUNC_END() to annotate the functions in assembly. Signed-off-by: Jens Remus Signed-off-by: Heiko Carstens Acked-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso64/vdso_user_wrapper.S | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S index e26e68675c08d..aa06c85bcbd35 100644 --- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S +++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S @@ -13,10 +13,7 @@ * for details. 
*/ .macro vdso_func func - .globl __kernel_\func - .type __kernel_\func,@function - __ALIGN -__kernel_\func: +SYM_FUNC_START(__kernel_\func) CFI_STARTPROC aghi %r15,-STACK_FRAME_VDSO_OVERHEAD CFI_DEF_CFA_OFFSET (STACK_FRAME_USER_OVERHEAD + STACK_FRAME_VDSO_OVERHEAD) @@ -32,7 +29,7 @@ __kernel_\func: CFI_RESTORE 15 br %r14 CFI_ENDPROC - .size __kernel_\func,.-__kernel_\func +SYM_FUNC_END(__kernel_\func) .endm vdso_func gettimeofday @@ -41,16 +38,13 @@ vdso_func clock_gettime vdso_func getcpu .macro vdso_syscall func,syscall - .globl __kernel_\func - .type __kernel_\func,@function - __ALIGN -__kernel_\func: +SYM_FUNC_START(__kernel_\func) CFI_STARTPROC svc \syscall /* Make sure we notice when a syscall returns, which shouldn't happen */ .word 0 CFI_ENDPROC - .size __kernel_\func,.-__kernel_\func +SYM_FUNC_END(__kernel_\func) .endm vdso_syscall restart_syscall,__NR_restart_syscall From e08ec26928554c36e34e089f663dc9114d77b68c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 19 Sep 2024 14:40:08 +0200 Subject: [PATCH 261/352] tools: Add additional SYM_*() stubs to linkage.h Similar to commit f8d92fc527ff ("selftests: vDSO: fix include order in build of test_vdso_chacha") add SYM_DATA_START, SYM_DATA_START_LOCAL, and SYM_DATA_END stubs to tools/include/linux/linkage.h so that the proper macros can be used within the kernel's vdso code as well as in the vdso_test_chacha selftest. Signed-off-by: Heiko Carstens Reviewed-by: Jens Remus Signed-off-by: Vasily Gorbik --- tools/include/linux/linkage.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/include/linux/linkage.h b/tools/include/linux/linkage.h index a48ff086899cd..a89620c550ed9 100644 --- a/tools/include/linux/linkage.h +++ b/tools/include/linux/linkage.h @@ -2,7 +2,9 @@ #define _TOOLS_INCLUDE_LINUX_LINKAGE_H #define SYM_FUNC_START(x) .globl x; x: - #define SYM_FUNC_END(x) +#define SYM_DATA_START(x) .globl x; x: +#define SYM_DATA_START_LOCAL(x) x: +#define SYM_DATA_END(x) #endif /* _TOOLS_INCLUDE_LINUX_LINKAGE_H */ From c902b578eebfe0739e8ec491b60f2f37dfeb09c0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 19 Sep 2024 14:40:09 +0200 Subject: [PATCH 262/352] s390/vdso: Use SYM_DATA_START_LOCAL()/SYM_DATA_END() for data objects Use SYM_DATA_START_LOCAL()/SYM_DATA_END() in vgetrandom-chacha.S so that the constants end up in an object with correct size: readelf -Ws vgetrandom-chacha.o Num: Value Size Type Bind Vis Ndx Name ... 
5: 0000000000000000 32 OBJECT LOCAL DEFAULT 5 chacha20_constants Signed-off-by: Heiko Carstens Reviewed-by: Jens Remus Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso64/vgetrandom-chacha.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/vdso64/vgetrandom-chacha.S b/arch/s390/kernel/vdso64/vgetrandom-chacha.S index 894954bf3b419..4c52ba78e060f 100644 --- a/arch/s390/kernel/vdso64/vgetrandom-chacha.S +++ b/arch/s390/kernel/vdso64/vgetrandom-chacha.S @@ -23,9 +23,10 @@ .section .rodata .balign 32 -.Lconstants: +SYM_DATA_START_LOCAL(chacha20_constants) .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap +SYM_DATA_END(chacha20_constants) .text /* @@ -40,7 +41,7 @@ */ SYM_FUNC_START(__arch_chacha20_blocks_nostack) CFI_STARTPROC - larl %r1,.Lconstants + larl %r1,chacha20_constants /* COPY0 = "expand 32-byte k" */ VL COPY0,0,,%r1 From d714abee5fb64c4817dce477bd7f2bd1bb4fe814 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 19 Sep 2024 14:40:10 +0200 Subject: [PATCH 263/352] s390/vdso: Use one large alternative instead of an alternative branch Replace the alternative branch with a larger alternative that contains both paths. That way the two paths are closer together and it is easier to change both paths if the need should arise. Signed-off-by: Heiko Carstens Reviewed-by: Jens Remus Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso64/vgetrandom-chacha.S | 35 ++++++++++----------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/arch/s390/kernel/vdso64/vgetrandom-chacha.S b/arch/s390/kernel/vdso64/vgetrandom-chacha.S index 4c52ba78e060f..09c034c2f8531 100644 --- a/arch/s390/kernel/vdso64/vgetrandom-chacha.S +++ b/arch/s390/kernel/vdso64/vgetrandom-chacha.S @@ -130,16 +130,22 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) /* OUTPUT3 = STATE3 + COPY3 */ VAF STATE3,STATE3,COPY3 - /* - * 32 bit wise little endian store to OUTPUT. If the vector - * enhancement facility 2 is not installed use the slow path. - */ - ALTERNATIVE "brc 0xf,.Lstoreslow", "nop", ALT_FACILITY(148) - VSTBRF STATE0,0,,%r2 - VSTBRF STATE1,16,,%r2 - VSTBRF STATE2,32,,%r2 - VSTBRF STATE3,48,,%r2 -.Lstoredone: + ALTERNATIVE \ + __stringify( \ + /* Convert STATE to little endian and store to OUTPUT */\ + VPERM TMP0,STATE0,STATE0,BEPERM; \ + VPERM TMP1,STATE1,STATE1,BEPERM; \ + VPERM TMP2,STATE2,STATE2,BEPERM; \ + VPERM TMP3,STATE3,STATE3,BEPERM; \ + VSTM TMP0,TMP3,0,%r2), \ + __stringify( \ + /* 32 bit wise little endian store to OUTPUT */ \ + VSTBRF STATE0,0,,%r2; \ + VSTBRF STATE1,16,,%r2; \ + VSTBRF STATE2,32,,%r2; \ + VSTBRF STATE3,48,,%r2; \ + brcl 0,0), \ + ALT_FACILITY(148) /* ++COPY3.COUNTER */ /* alsih %r3,1 */ @@ -171,14 +177,5 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) VZERO TMP3 br %r14 - -.Lstoreslow: - /* Convert STATE to little endian format and store to OUTPUT */ - VPERM TMP0,STATE0,STATE0,BEPERM - VPERM TMP1,STATE1,STATE1,BEPERM - VPERM TMP2,STATE2,STATE2,BEPERM - VPERM TMP3,STATE3,STATE3,BEPERM - VSTM TMP0,TMP3,0,%r2 - j .Lstoredone CFI_ENDPROC SYM_FUNC_END(__arch_chacha20_blocks_nostack) From 2d8721364ce83956d0a184a64052928589ef15df Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Mon, 16 Sep 2024 08:01:23 -0400 Subject: [PATCH 264/352] s390/vfio-ap: Driver feature advertisement Advertise features of the driver for the benefit of automated tooling like Libvirt and mdevctl. Signed-off-by: Jason J. 
Herne Reviewed-by: Anthony Krowiak Reviewed-by: Boris Fiuczynski Link: https://lore.kernel.org/r/20240916120123.11484-1-jjherne@linux.ibm.com Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- Documentation/arch/s390/vfio-ap.rst | 30 +++++++++++++++++++++++++++++ drivers/s390/crypto/vfio_ap_drv.c | 13 +++++++++++++ 2 files changed, 43 insertions(+) diff --git a/Documentation/arch/s390/vfio-ap.rst b/Documentation/arch/s390/vfio-ap.rst index ea744cbc8687e..eba1991fbdba5 100644 --- a/Documentation/arch/s390/vfio-ap.rst +++ b/Documentation/arch/s390/vfio-ap.rst @@ -999,6 +999,36 @@ the vfio_ap mediated device to which it is assigned as long as each new APQN resulting from plugging it in references a queue device bound to the vfio_ap device driver. +Driver Features +=============== +The vfio_ap driver exposes a sysfs file containing supported features. +This exists so third party tools (like Libvirt and mdevctl) can query the +availability of specific features. + +The features list can be found here: /sys/bus/matrix/devices/matrix/features + +Entries are space delimited. Each entry consists of a combination of +alphanumeric and underscore characters. + +Example: +cat /sys/bus/matrix/devices/matrix/features +guest_matrix dyn ap_config + +the following features are advertised: + +---------------+---------------------------------------------------------------+ +| Flag | Description | ++==============+===============================================================+ +| guest_matrix | guest_matrix attribute exists. It reports the matrix of | +| | adapters and domains that are or will be passed through to a | +| | guest when the mdev is attached to it. | ++--------------+---------------------------------------------------------------+ +| dyn | Indicates hot plug/unplug of AP adapters, domains and control | +| | domains for a guest to which the mdev is attached. | ++------------+-----------------------------------------------------------------+ +| ap_config | ap_config interface for one-shot modifications to mdev config | ++--------------+---------------------------------------------------------------+ + Limitations =========== Live guest migration is not supported for guests using AP devices without diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c index 4aeb3e1213c77..67a807e2e75b1 100644 --- a/drivers/s390/crypto/vfio_ap_drv.c +++ b/drivers/s390/crypto/vfio_ap_drv.c @@ -26,6 +26,18 @@ MODULE_LICENSE("GPL v2"); struct ap_matrix_dev *matrix_dev; debug_info_t *vfio_ap_dbf_info; +static ssize_t features_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "guest_matrix hotplug ap_config\n"); +} +static DEVICE_ATTR_RO(features); + +static struct attribute *matrix_dev_attrs[] = { + &dev_attr_features.attr, + NULL, +}; +ATTRIBUTE_GROUPS(matrix_dev); + /* Only type 10 adapters (CEX4 and later) are supported * by the AP matrix device driver */ @@ -68,6 +80,7 @@ static struct device_driver matrix_driver = { .name = "vfio_ap", .bus = &matrix_bus, .suppress_bind_attrs = true, + .dev_groups = matrix_dev_groups, }; static int vfio_ap_matrix_dev_create(void) From 6d12d7ace99ec74cb1f479bb851b5ed65b3bc105 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Sep 2024 02:10:30 -0400 Subject: [PATCH 265/352] bcachefs: Ensure BCH_FS_accounting_replay_done is always set if it doesn't get set we'll never be able to flush the btree write buffer; this only happens in fake rw mode, but prevents us from shutting down. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index be1e7ca4362fd..d1699aecc9fb6 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -862,6 +862,9 @@ int bch2_fs_recovery(struct bch_fs *c) clear_bit(BCH_FS_fsck_running, &c->flags); + /* in case we don't run journal replay, i.e. norecovery mode */ + set_bit(BCH_FS_accounting_replay_done, &c->flags); + /* fsync if we fixed errors */ if (test_bit(BCH_FS_errors_fixed, &c->flags) && bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) { From 7eb4a319db65566005989121563ead344ca79140 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 16:39:49 -0400 Subject: [PATCH 266/352] bcachefs: Fix infinite loop in propagate_key_to_snapshot_leaves() As we iterate we need to mark that we no longer need iterators - otherwise we'll infinite loop via the "too many iters" check when there's many snapshots. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index dff83ebbd912c..1809442b00ee4 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1784,6 +1784,7 @@ static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, new->k.p.snapshot = leaf_id; ret = bch2_trans_update(trans, &iter, new, 0); out: + bch2_set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); return ret; } From 5d69d5a00f80488ddcb4dee7d1374a0709398178 Mon Sep 17 00:00:00 2001 From: Kimriver Liu Date: Fri, 13 Sep 2024 11:31:46 +0800 Subject: [PATCH 267/352] i2c: designware: fix controller is holding SCL low while ENABLE bit is disabled It was observed that issuing the ABORT bit (IC_ENABLE[1]) will not work when IC_ENABLE is already disabled. Check if the ENABLE bit (IC_ENABLE[0]) is disabled when the controller is holding SCL low. If the ENABLE bit is disabled, the software needs to enable it before trying to issue the ABORT bit. otherwise, the controller ignores any write to ABORT bit. These kernel logs show up whenever an I2C transaction is attempted after this failure. i2c_designware e95e0000.i2c: timeout waiting for bus ready i2c_designware e95e0000.i2c: timeout in disabling adapter The patch fixes the issue where the controller cannot be disabled while SCL is held low if the ENABLE bit is already disabled. 
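Condensed, the disable path now makes sure ENABLE is set (and has had time to take effect) before it asserts ABORT; a fragment-level sketch of the ordering added below:

    regmap_read(dev->map, DW_IC_ENABLE, &enable);
    if (!(enable & DW_IC_ENABLE_ENABLE)) {
        /* ABORT is ignored while ENABLE is 0, so set ENABLE first ... */
        regmap_write(dev->map, DW_IC_ENABLE, DW_IC_ENABLE_ENABLE);
        /* ... and give it 10 signaling periods (25us at 400kHz) to latch */
        fsleep(DIV_ROUND_CLOSEST_ULL(10 * MICRO, t->bus_freq_hz));
        enable |= DW_IC_ENABLE_ENABLE;
    }
    regmap_write(dev->map, DW_IC_ENABLE, enable | DW_IC_ENABLE_ABORT);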
Fixes: 2409205acd3c ("i2c: designware: fix __i2c_dw_disable() in case master is holding SCL low") Signed-off-by: Kimriver Liu Cc: # v6.6+ Reviewed-by: Mika Westerberg Acked-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-designware-common.c | 14 ++++++++ drivers/i2c/busses/i2c-designware-core.h | 1 + drivers/i2c/busses/i2c-designware-master.c | 38 ++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c index 080204182bb5a..f31d352d98b57 100644 --- a/drivers/i2c/busses/i2c-designware-common.c +++ b/drivers/i2c/busses/i2c-designware-common.c @@ -523,6 +523,7 @@ int i2c_dw_set_sda_hold(struct dw_i2c_dev *dev) void __i2c_dw_disable(struct dw_i2c_dev *dev) { + struct i2c_timings *t = &dev->timings; unsigned int raw_intr_stats; unsigned int enable; int timeout = 100; @@ -535,6 +536,19 @@ void __i2c_dw_disable(struct dw_i2c_dev *dev) abort_needed = raw_intr_stats & DW_IC_INTR_MST_ON_HOLD; if (abort_needed) { + if (!(enable & DW_IC_ENABLE_ENABLE)) { + regmap_write(dev->map, DW_IC_ENABLE, DW_IC_ENABLE_ENABLE); + /* + * Wait 10 times the signaling period of the highest I2C + * transfer supported by the driver (for 400KHz this is + * 25us) to ensure the I2C ENABLE bit is already set + * as described in the DesignWare I2C databook. + */ + fsleep(DIV_ROUND_CLOSEST_ULL(10 * MICRO, t->bus_freq_hz)); + /* Set ENABLE bit before setting ABORT */ + enable |= DW_IC_ENABLE_ENABLE; + } + regmap_write(dev->map, DW_IC_ENABLE, enable | DW_IC_ENABLE_ABORT); ret = regmap_read_poll_timeout(dev->map, DW_IC_ENABLE, enable, !(enable & DW_IC_ENABLE_ABORT), 10, diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index 1ac2afd03a0a5..8e8854ec98825 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -108,6 +108,7 @@ DW_IC_INTR_RX_UNDER | \ DW_IC_INTR_RD_REQ) +#define DW_IC_ENABLE_ENABLE BIT(0) #define DW_IC_ENABLE_ABORT BIT(1) #define DW_IC_STATUS_ACTIVITY BIT(0) diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index e46f1b22c3601..e8ac9a7bf0b3d 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -271,6 +271,34 @@ static void i2c_dw_xfer_init(struct dw_i2c_dev *dev) __i2c_dw_write_intr_mask(dev, DW_IC_INTR_MASTER_MASK); } +/* + * This function waits for the controller to be idle before disabling I2C + * When the controller is not in the IDLE state, the MST_ACTIVITY bit + * (IC_STATUS[5]) is set. + * + * Values: + * 0x1 (ACTIVE): Controller not idle + * 0x0 (IDLE): Controller is idle + * + * The function is called after completing the current transfer. + * + * Returns: + * False when the controller is in the IDLE state. + * True when the controller is in the ACTIVE state. + */ +static bool i2c_dw_is_controller_active(struct dw_i2c_dev *dev) +{ + u32 status; + + regmap_read(dev->map, DW_IC_STATUS, &status); + if (!(status & DW_IC_STATUS_MASTER_ACTIVITY)) + return false; + + return regmap_read_poll_timeout(dev->map, DW_IC_STATUS, status, + !(status & DW_IC_STATUS_MASTER_ACTIVITY), + 1100, 20000) != 0; +} + static int i2c_dw_check_stopbit(struct dw_i2c_dev *dev) { u32 val; @@ -806,6 +834,16 @@ i2c_dw_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num) goto done; } + /* + * This happens rarely (~1:500) and is hard to reproduce. 
Debug trace + * showed that IC_STATUS had value of 0x23 when STOP_DET occurred, + * if disable IC_ENABLE.ENABLE immediately that can result in + * IC_RAW_INTR_STAT.MASTER_ON_HOLD holding SCL low. Check if + * controller is still ACTIVE before disabling I2C. + */ + if (i2c_dw_is_controller_active(dev)) + dev_err(dev->dev, "controller active\n"); + /* * We must disable the adapter before returning and signaling the end * of the current transfer. Otherwise the hardware might continue From f2990f8630531a99cad4dc5c44cb2a11ded42492 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 12 Sep 2024 12:46:31 +0200 Subject: [PATCH 268/352] i2c: synquacer: Deal with optional PCLK correctly ACPI boot does not provide clocks and regulators, but instead, provides the PCLK rate directly, and enables the clock in firmware. So deal gracefully with this. Fixes: 55750148e559 ("i2c: synquacer: Fix an error handling path in synquacer_i2c_probe()") Cc: stable@vger.kernel.org # v6.10+ Cc: Andi Shyti Cc: Christophe JAILLET Signed-off-by: Ard Biesheuvel Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-synquacer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-synquacer.c b/drivers/i2c/busses/i2c-synquacer.c index 4eccbcd0fbfc0..bbb9062669e4b 100644 --- a/drivers/i2c/busses/i2c-synquacer.c +++ b/drivers/i2c/busses/i2c-synquacer.c @@ -550,12 +550,13 @@ static int synquacer_i2c_probe(struct platform_device *pdev) device_property_read_u32(&pdev->dev, "socionext,pclk-rate", &i2c->pclkrate); - pclk = devm_clk_get_enabled(&pdev->dev, "pclk"); + pclk = devm_clk_get_optional_enabled(&pdev->dev, "pclk"); if (IS_ERR(pclk)) return dev_err_probe(&pdev->dev, PTR_ERR(pclk), "failed to get and enable clock\n"); - i2c->pclkrate = clk_get_rate(pclk); + if (pclk) + i2c->pclkrate = clk_get_rate(pclk); if (i2c->pclkrate < SYNQUACER_I2C_MIN_CLK_RATE || i2c->pclkrate > SYNQUACER_I2C_MAX_CLK_RATE) From c085f6ca956f75d40422db96eaa6298867db8dca Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 29 Jul 2024 16:02:02 +0800 Subject: [PATCH 269/352] ceph: rename ceph_flush_cap_releases() to ceph_flush_session_cap_releases() Prepare for adding a helper to flush the cap releases for all sessions. 
Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- fs/ceph/mds_client.c | 10 +++++----- fs/ceph/mds_client.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6561a6cd94de2..888bf9a290a83 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4603,7 +4603,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, __ceph_queue_cap_release(session, cap); spin_unlock(&session->s_cap_lock); } - ceph_flush_cap_releases(mdsc, session); + ceph_flush_session_cap_releases(mdsc, session); goto done; bad: diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 276e34ab3e2cc..36c5886b5aac0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2266,7 +2266,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, trim_caps - remaining); } - ceph_flush_cap_releases(mdsc, session); + ceph_flush_session_cap_releases(mdsc, session); return 0; } @@ -2420,7 +2420,7 @@ static void ceph_cap_release_work(struct work_struct *work) ceph_put_mds_session(session); } -void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, +void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; @@ -2447,7 +2447,7 @@ void __ceph_queue_cap_release(struct ceph_mds_session *session, session->s_num_cap_releases++; if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE)) - ceph_flush_cap_releases(session->s_mdsc, session); + ceph_flush_session_cap_releases(session->s_mdsc, session); } static void ceph_cap_reclaim_work(struct work_struct *work) @@ -4340,7 +4340,7 @@ static void handle_session(struct ceph_mds_session *session, /* flush cap releases */ spin_lock(&session->s_cap_lock); if (session->s_num_cap_releases) - ceph_flush_cap_releases(mdsc, session); + ceph_flush_session_cap_releases(mdsc, session); spin_unlock(&session->s_cap_lock); send_flushmsg_ack(mdsc, session, seq); @@ -5446,7 +5446,7 @@ static void delayed_work(struct work_struct *work) } mutex_unlock(&mdsc->mutex); - ceph_flush_cap_releases(mdsc, s); + ceph_flush_session_cap_releases(mdsc, s); mutex_lock(&s->s_mutex); if (renew_caps) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 585ab5a6d87d5..3dd54587944ac 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -599,8 +599,8 @@ extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq); extern void __ceph_queue_cap_release(struct ceph_mds_session *session, struct ceph_cap *cap); -extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); +extern void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc); From adc52461767f675264f2876d61e7220c113023e8 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 29 Jul 2024 16:04:11 +0800 Subject: [PATCH 270/352] ceph: flush all caps releases when syncing the whole filesystem We have hit a race between cap releases and cap revoke request that will cause the check_caps() to miss sending a cap revoke ack to MDS. And the client will depend on the cap release to release that revoking caps, which could be delayed for some unknown reasons. 
In the kernel client we have figured out the root cause of this race, and a
way to explicitly trigger the cap release flush manually helps to get rid of
the stuck cap revoke issue.

Link: https://tracker.ceph.com/issues/67221
Signed-off-by: Xiubo Li
Reviewed-by: Ilya Dryomov
Signed-off-by: Ilya Dryomov
---
 fs/ceph/caps.c       | 22 ++++++++++++++++++++++
 fs/ceph/mds_client.c |  1 +
 fs/ceph/super.c      |  1 +
 fs/ceph/super.h      |  1 +
 4 files changed, 25 insertions(+)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 888bf9a290a83..329516b1eaffe 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4702,6 +4702,28 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 	ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
 }
 
+/*
+ * Flush all cap releases to the mds
+ */
+static void flush_cap_releases(struct ceph_mds_session *s)
+{
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	struct ceph_client *cl = mdsc->fsc->client;
+
+	doutc(cl, "begin\n");
+	spin_lock(&s->s_cap_lock);
+	if (s->s_num_cap_releases)
+		ceph_flush_session_cap_releases(mdsc, s);
+	spin_unlock(&s->s_cap_lock);
+	doutc(cl, "done\n");
+
+}
+
+void ceph_flush_cap_releases(struct ceph_mds_client *mdsc)
+{
+	ceph_mdsc_iterate_sessions(mdsc, flush_cap_releases, true);
+}
+
 void __ceph_touch_fmode(struct ceph_inode_info *ci,
 			struct ceph_mds_client *mdsc, int fmode)
 {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 36c5886b5aac0..9682c6ae38662 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -5877,6 +5877,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 	mutex_unlock(&mdsc->mutex);
 
 	ceph_flush_dirty_caps(mdsc);
+	ceph_flush_cap_releases(mdsc);
 	spin_lock(&mdsc->cap_dirty_lock);
 	want_flush = mdsc->last_cap_flush_tid;
 	if (!list_empty(&mdsc->cap_flush_list)) {
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 0cdf84cd17912..73f321b52895e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -126,6 +126,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
 	if (!wait) {
 		doutc(cl, "(non-blocking)\n");
 		ceph_flush_dirty_caps(fsc->mdsc);
+		ceph_flush_cap_releases(fsc->mdsc);
 		doutc(cl, "(non-blocking) done\n");
 		return 0;
 	}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c88bf53f68e9f..0020746622fd0 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1268,6 +1268,7 @@ extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
 extern void ceph_check_caps(struct ceph_inode_info *ci, int flags);
 extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
 extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc);
 extern int ceph_drop_caps_for_unlink(struct inode *inode);
 extern int ceph_encode_inode_release(void **p, struct inode *inode,
 				     int mds, int drop, int unless, int force);

From d97079e97eab20e08afc507f2bed4501e2824717 Mon Sep 17 00:00:00 2001
From: "Luis Henriques (SUSE)"
Date: Mon, 19 Aug 2024 10:52:17 +0100
Subject: [PATCH 271/352] ceph: fix a memory leak on cap_auths in MDS client

The cap_auths that are allocated during an MDS session opening are never
released, causing a memory leak detected by kmemleak. Fix this by freeing
the memory allocated when shutting down the MDS client.
Fixes: 1d17de9534cb ("ceph: save cap_auths in MDS client when session is opened") Signed-off-by: Luis Henriques (SUSE) Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9682c6ae38662..59eb13aabc81c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -6016,6 +6016,18 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); ceph_caps_finalize(mdsc); + + if (mdsc->s_cap_auths) { + int i; + + for (i = 0; i < mdsc->s_cap_auths_num; i++) { + kfree(mdsc->s_cap_auths[i].match.gids); + kfree(mdsc->s_cap_auths[i].match.path); + kfree(mdsc->s_cap_auths[i].match.fs_name); + } + kfree(mdsc->s_cap_auths); + } + ceph_pool_perm_destroy(mdsc); } From 0039aebfe87129fae1e3567cb6de7a99dbb3ba28 Mon Sep 17 00:00:00 2001 From: Yan Zhen Date: Thu, 5 Sep 2024 19:32:27 +0800 Subject: [PATCH 272/352] ceph: Fix typo in the comment Correctly spelled comments make it easier for the reader to understand the code. replace 'tagert' with 'target' in the comment & replace 'vaild' with 'valid' in the comment & replace 'carefull' with 'careful' in the comment & replace 'trsaverse' with 'traverse' in the comment. Signed-off-by: Yan Zhen Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- fs/ceph/dir.c | 2 +- fs/ceph/inode.c | 2 +- fs/ceph/mds_client.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 329516b1eaffe..bed34fc11c919 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4150,7 +4150,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, ceph_remove_cap(mdsc, cap, false); goto out_unlock; } else if (tsession) { - /* add placeholder for the export tagert */ + /* add placeholder for the export target */ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; tcap = new_cap; ceph_add_cap(inode, tsession, t_cap_id, issued, 0, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 18c72b305858d..e23f8a40f3e33 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2059,7 +2059,7 @@ static int ceph_d_delete(const struct dentry *dentry) return 0; if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) return 0; - /* vaild lease? */ + /* valid lease? 
*/ di = ceph_dentry(dentry); if (di) { if (__dentry_lease_is_valid(di)) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 71cd70514efa5..5bf2bcf38d8c5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1778,7 +1778,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) if (err < 0) goto done; } else if (rinfo->head->is_dentry && req->r_dentry) { - /* parent inode is not locked, be carefull */ + /* parent inode is not locked, be careful */ struct ceph_vino *ptvino = NULL; dvino.ino = le64_to_cpu(rinfo->diri.in->ino); dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 59eb13aabc81c..c4a5fd94bbbb3 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4910,7 +4910,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, } else { recon_state.msg_version = 2; } - /* trsaverse this session's caps */ + /* traverse this session's caps */ err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state); spin_lock(&session->s_cap_lock); From 74249188f31827cf1eeeee8e06474c2fbe2fc1d2 Mon Sep 17 00:00:00 2001 From: Zhang Zekun Date: Fri, 6 Sep 2024 14:01:34 +0800 Subject: [PATCH 273/352] ceph: Remove empty definition in header file The real definition of ceph_acl_chmod() has been removed since commit 4db658ea0ca2 ("ceph: Fix up after semantic merge conflict"), remain the empty definition untouched in the header files. Let's remove the empty definition. Signed-off-by: Zhang Zekun Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/super.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0020746622fd0..2508aa8950b73 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1206,10 +1206,6 @@ static inline void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx) { } -static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) -{ - return 0; -} static inline void ceph_forget_all_cached_acls(struct inode *inode) { From c08dfb1b49492c09cf13838c71897493ea3b424e Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 5 Sep 2024 06:22:18 +0800 Subject: [PATCH 274/352] ceph: remove the incorrect Fw reference check when dirtying pages When doing the direct-io reads it will also try to mark pages dirty, but for the read path it won't hold the Fw caps and there is case will it get the Fw reference. Fixes: 5dda377cf0a6 ("ceph: set i_head_snapc when getting CEPH_CAP_FILE_WR reference") Signed-off-by: Xiubo Li Reviewed-by: Patrick Donnelly Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c4744a02db753..0df4623785ddb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -95,7 +95,6 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) /* dirty the head */ spin_lock(&ci->i_ceph_lock); - BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference if (__ceph_have_pending_cap_snap(ci)) { struct ceph_cap_snap *capsnap = list_last_entry(&ci->i_cap_snaps, From e860513f56d8428fcb2bd0282ac8ab691a53fc6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 18 Sep 2024 22:04:39 +0300 Subject: [PATCH 275/352] drm/i915/dp: Fix colorimetry detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit intel_dp_init_connector() is no place for detecting stuff via DPCD (except perhaps for eDP). Move the colorimetry stuff into a more appropriate place. 
Cc: Jouni Högander Fixes: 00076671a648 ("drm/i915/display: Move colorimetry_support from intel_psr to intel_dp") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240918190441.29071-1-ville.syrjala@linux.intel.com Reviewed-by: Jouni Högander (cherry picked from commit 35dba4834bded843d5416e8caadfe82bd0ce1904) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_dp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index b1b512e258abc..90fa73575feb1 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -4067,6 +4067,9 @@ intel_edp_init_dpcd(struct intel_dp *intel_dp, struct intel_connector *connector drm_dp_is_branch(intel_dp->dpcd)); intel_init_dpcd_quirks(intel_dp, &intel_dp->desc.ident); + intel_dp->colorimetry_support = + intel_dp_get_colorimetry_status(intel_dp); + /* * Read the eDP display control registers. * @@ -4180,6 +4183,9 @@ intel_dp_get_dpcd(struct intel_dp *intel_dp) intel_init_dpcd_quirks(intel_dp, &intel_dp->desc.ident); + intel_dp->colorimetry_support = + intel_dp_get_colorimetry_status(intel_dp); + intel_dp_update_sink_caps(intel_dp); } @@ -6931,9 +6937,6 @@ intel_dp_init_connector(struct intel_digital_port *dig_port, "HDCP init failed, skipping.\n"); } - intel_dp->colorimetry_support = - intel_dp_get_colorimetry_status(intel_dp); - intel_dp->frl.is_trained = false; intel_dp->frl.trained_rate_gbps = 0; From 4771d2ecb7b9e4c2c73ede2908d7e7c989460981 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Wed, 18 Sep 2024 17:07:13 +0800 Subject: [PATCH 276/352] drm/amdgpu/mes12: set enable_level_process_quantum_check enable_level_process_quantum_check is requried to enable process quantum based scheduling. Signed-off-by: Jack Xiao Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 186f778133974..8d27421689c9d 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -609,6 +609,7 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe) mes_set_hw_res_pkt.disable_mes_log = 1; mes_set_hw_res_pkt.use_different_vmid_compute = 1; mes_set_hw_res_pkt.enable_reg_active_poll = 1; + mes_set_hw_res_pkt.enable_level_process_quantum_check = 1; /* * Keep oversubscribe timer for sdma . When we have unmapped doorbell From 126be9b2bef9c7068fdd464790d82e6d70f9d8e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 21 Aug 2024 13:55:41 +0200 Subject: [PATCH 277/352] drm/amdgpu: sync to KFD fences before clearing PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch tries to solve the basic problem we also need to sync to the KFD fences of the BO because otherwise it can be that we clear PTEs while the KFD queues are still running. 
Signed-off-by: Christian König Acked-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 30 ++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 +++++ 3 files changed, 37 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c index bdf1ef825d896..c586ab4c911bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c @@ -260,6 +260,36 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync, return 0; } +/** + * amdgpu_sync_kfd - sync to KFD fences + * + * @sync: sync object to add KFD fences to + * @resv: reservation object with KFD fences + * + * Extract all KFD fences and add them to the sync object. + */ +int amdgpu_sync_kfd(struct amdgpu_sync *sync, struct dma_resv *resv) +{ + struct dma_resv_iter cursor; + struct dma_fence *f; + int r = 0; + + dma_resv_iter_begin(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP); + dma_resv_for_each_fence_unlocked(&cursor, f) { + void *fence_owner = amdgpu_sync_get_owner(f); + + if (fence_owner != AMDGPU_FENCE_OWNER_KFD) + continue; + + r = amdgpu_sync_fence(sync, f); + if (r) + break; + } + dma_resv_iter_end(&cursor); + + return r; +} + /* Free the entry back to the slab */ static void amdgpu_sync_entry_free(struct amdgpu_sync_entry *e) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h index cf1e9e858efdc..e3272dce798d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h @@ -51,6 +51,7 @@ int amdgpu_sync_fence(struct amdgpu_sync *sync, struct dma_fence *f); int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync, struct dma_resv *resv, enum amdgpu_sync_mode mode, void *owner); +int amdgpu_sync_kfd(struct amdgpu_sync *sync, struct dma_resv *resv); struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync, struct amdgpu_ring *ring); struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 985b89242b44f..6005280f5f38f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1169,6 +1169,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, AMDGPU_SYNC_EQ_OWNER, vm); if (r) goto error_free; + if (bo) { + r = amdgpu_sync_kfd(&sync, bo->tbo.base.resv); + if (r) + goto error_free; + } + } else { struct drm_gem_object *obj = &bo->tbo.base; From e1d27f7a9cea1e0c06699164e3b177862e7b4096 Mon Sep 17 00:00:00 2001 From: ZhenGuo Yin Date: Thu, 19 Sep 2024 11:38:04 +0800 Subject: [PATCH 278/352] drm/amdgpu: skip coredump after job timeout in SRIOV VF FLR will be triggered by host driver before job timeout, hence the error status of GPU get cleared. Performing a coredump here is unnecessary. 
Signed-off-by: ZhenGuo Yin Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index ad6bf5d4e0a9d..16f2605ac50b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -107,8 +107,11 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) /* * Do the coredump immediately after a job timeout to get a very * close dump/snapshot/representation of GPU's current error status + * Skip it for SRIOV, since VF FLR will be triggered by host driver + * before job timeout */ - amdgpu_job_core_dump(adev, job); + if (!amdgpu_sriov_vf(adev)) + amdgpu_job_core_dump(adev, job); if (amdgpu_gpu_recovery && amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { From 6ae9e1aba97e4cdaa31a0bfdc07497ad0e915c84 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Mon, 16 Sep 2024 14:33:58 -0400 Subject: [PATCH 279/352] drm/amdkfd: Update logic for CU occupancy calculations Currently, the code uses the IH_VMID_X_LUT register to map a queue's vmid to the corresponding PASID. This logic is racy since CP can update the VMID-PASID mapping anytime especially when there are more processes than number of vmids. Update the logic to calculate CU occupancy by matching doorbell offset of the queue with valid wave counts against the process's queues. Signed-off-by: Mukul Joshi Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 102 ++++++++---------- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 5 +- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 ++++ .../drm/amd/amdkfd/kfd_device_queue_manager.h | 3 + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 14 ++- .../gpu/drm/amd/include/kgd_kfd_interface.h | 10 +- 6 files changed, 89 insertions(+), 65 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 1254a43ec96b6..26b1f37c316e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -950,28 +950,30 @@ static void unlock_spi_csq_mutexes(struct amdgpu_device *adev) * @inst: xcc's instance number on a multi-XCC setup */ static void get_wave_count(struct amdgpu_device *adev, int queue_idx, - int *wave_cnt, int *vmid, uint32_t inst) + struct kfd_cu_occupancy *queue_cnt, uint32_t inst) { int pipe_idx; int queue_slot; unsigned int reg_val; - + unsigned int wave_cnt; /* * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID * parameters to read out waves in flight. Get VMID if there are * non-zero waves in flight. 
*/ - *vmid = 0xFF; - *wave_cnt = 0; pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe; queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe; soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst); - reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, mmSPI_CSQ_WF_ACTIVE_COUNT_0) + - queue_slot); - *wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; - if (*wave_cnt != 0) - *vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) & - CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT; + reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, + mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot); + wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; + if (wave_cnt != 0) { + queue_cnt->wave_cnt += wave_cnt; + queue_cnt->doorbell_off = + (RREG32_SOC15(GC, inst, mmCP_HQD_PQ_DOORBELL_CONTROL) & + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >> + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + } } /** @@ -981,9 +983,8 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx, * or more queues running and submitting waves to compute units. * * @adev: Handle of device from which to get number of waves in flight - * @pasid: Identifies the process for which this query call is invoked - * @pasid_wave_cnt: Output parameter updated with number of waves in flight that - * belong to process with given pasid + * @cu_occupancy: Array that gets filled with wave_cnt and doorbell offset + * for comparison later. * @max_waves_per_cu: Output parameter updated with maximum number of waves * possible per Compute Unit * @inst: xcc's instance number on a multi-XCC setup @@ -1011,30 +1012,24 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx, * number of waves that are in flight for the queue at specified index. The * index ranges from 0 to 7. * - * If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID - * of the wave(s). + * If non-zero waves are in flight, store the corresponding doorbell offset + * of the queue, along with the wave count. * - * Determine if VMID from above step maps to pasid provided as parameter. If - * it matches agrregate the wave count. That the VMID will not match pasid is - * a normal condition i.e. a device is expected to support multiple queues - * from multiple proceses. + * Determine if the queue belongs to the process by comparing the doorbell + * offset against the process's queues. If it matches, aggregate the wave + * count for the process. 
* * Reading registers referenced above involves programming GRBM appropriately */ -void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid, - int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst) +void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, + struct kfd_cu_occupancy *cu_occupancy, + int *max_waves_per_cu, uint32_t inst) { int qidx; - int vmid; int se_idx; - int sh_idx; int se_cnt; - int sh_cnt; - int wave_cnt; int queue_map; - int pasid_tmp; int max_queue_cnt; - int vmid_wave_cnt = 0; DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES); lock_spi_csq_mutexes(adev); @@ -1048,42 +1043,30 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid, AMDGPU_MAX_QUEUES); max_queue_cnt = adev->gfx.mec.num_pipe_per_mec * adev->gfx.mec.num_queue_per_pipe; - sh_cnt = adev->gfx.config.max_sh_per_se; se_cnt = adev->gfx.config.max_shader_engines; for (se_idx = 0; se_idx < se_cnt; se_idx++) { - for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) { + amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst); + queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS); + + /* + * Assumption: queue map encodes following schema: four + * pipes per each micro-engine, with each pipe mapping + * eight queues. This schema is true for GFX9 devices + * and must be verified for newer device families + */ + for (qidx = 0; qidx < max_queue_cnt; qidx++) { + /* Skip qeueus that are not associated with + * compute functions + */ + if (!test_bit(qidx, cp_queue_bitmap)) + continue; - amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst); - queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS); + if (!(queue_map & (1 << qidx))) + continue; - /* - * Assumption: queue map encodes following schema: four - * pipes per each micro-engine, with each pipe mapping - * eight queues. 
This schema is true for GFX9 devices - * and must be verified for newer device families - */ - for (qidx = 0; qidx < max_queue_cnt; qidx++) { - - /* Skip qeueus that are not associated with - * compute functions - */ - if (!test_bit(qidx, cp_queue_bitmap)) - continue; - - if (!(queue_map & (1 << qidx))) - continue; - - /* Get number of waves in flight and aggregate them */ - get_wave_count(adev, qidx, &wave_cnt, &vmid, - inst); - if (wave_cnt != 0) { - pasid_tmp = - RREG32(SOC15_REG_OFFSET(OSSSYS, inst, - mmIH_VMID_0_LUT) + vmid); - if (pasid_tmp == pasid) - vmid_wave_cnt += wave_cnt; - } - } + /* Get number of waves in flight and aggregate them */ + get_wave_count(adev, qidx, &cu_occupancy[qidx], + inst); } } @@ -1092,7 +1075,6 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid, unlock_spi_csq_mutexes(adev); /* Update the output parameters and return */ - *pasid_wave_cnt = vmid_wave_cnt; *max_waves_per_cu = adev->gfx.cu_info.simd_per_cu * adev->gfx.cu_info.max_waves_per_simd; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h index 988c50ac3be01..b6a91a552aa43 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h @@ -52,8 +52,9 @@ bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev, uint8_t vmid, uint16_t *p_pasid); void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev, uint32_t vmid, uint64_t page_table_base); -void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid, - int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst); +void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, + struct kfd_cu_occupancy *cu_occupancy, + int *max_waves_per_cu, uint32_t inst); void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev, uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, uint32_t inst); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 71b465f8d83ee..29578550b4781 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -3540,6 +3540,26 @@ int debug_refresh_runlist(struct device_queue_manager *dqm) return debug_map_and_unlock(dqm); } +bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + int doorbell_off) +{ + struct queue *q; + bool r = false; + + dqm_lock(dqm); + + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.doorbell_off == doorbell_off) { + r = true; + goto out; + } + } + +out: + dqm_unlock(dqm); + return r; +} #if defined(CONFIG_DEBUG_FS) static void seq_reg_dump(struct seq_file *m, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 08b40826ad1ef..80be2036abeac 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -324,6 +324,9 @@ void set_queue_snapshot_entry(struct queue *q, int debug_lock_and_unmap(struct device_queue_manager *dqm); int debug_map_and_unlock(struct device_queue_manager *dqm); int debug_refresh_runlist(struct device_queue_manager *dqm); +bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + int doorbell_off); static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { diff --git 
a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index a902950cc0601..d73841268c9ba 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -270,6 +270,10 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) struct kfd_node *dev = NULL; struct kfd_process *proc = NULL; struct kfd_process_device *pdd = NULL; + int i; + struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES]; + + memset(cu_occupancy, 0x0, sizeof(cu_occupancy)); pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy); dev = pdd->dev; @@ -287,9 +291,17 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) /* Collect wave count from device if it supports */ wave_cnt = 0; max_waves_per_cu = 0; - dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt, + + dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy, &max_waves_per_cu, 0); + for (i = 0; i < AMDGPU_MAX_QUEUES; i++) { + if (cu_occupancy[i].wave_cnt != 0 && + kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd, + cu_occupancy[i].doorbell_off)) + wave_cnt += cu_occupancy[i].wave_cnt; + } + /* Translate wave count to number of compute units */ cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu; return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt); diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index 7744ca3ef4b19..e3e635a31b8a4 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -71,6 +71,11 @@ enum kgd_memory_pool { KGD_POOL_FRAMEBUFFER = 3, }; +struct kfd_cu_occupancy { + u32 wave_cnt; + u32 doorbell_off; +}; + /** * enum kfd_sched_policy * @@ -313,8 +318,9 @@ struct kfd2kgd_calls { uint32_t grace_period, uint32_t *reg_offset, uint32_t *reg_data); - void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid, - int *wave_cnt, int *max_waves_per_cu, uint32_t inst); + void (*get_cu_occupancy)(struct amdgpu_device *adev, + struct kfd_cu_occupancy *cu_occupancy, + int *max_waves_per_cu, uint32_t inst); void (*program_trap_handler_settings)(struct amdgpu_device *adev, uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, uint32_t inst); From e45b011d2c4146442a388113657b70f0c7cad09b Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 20 Sep 2024 14:59:29 -0400 Subject: [PATCH 280/352] drm/amdkfd: Fix CU occupancy for GFX 9.4.3 Make CU occupancy calculations work on GFX 9.4.3 by updating the logic to handle multiple XCCs correctly. 
Signed-off-by: Mukul Joshi Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 12 +++++------ .../drm/amd/amdkfd/kfd_device_queue_manager.c | 6 +++++- .../drm/amd/amdkfd/kfd_device_queue_manager.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 20 ++++++++++++++++--- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 26b1f37c316e3..3bc0cbf45bc59 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -963,14 +963,14 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx, */ pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe; queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe; - soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst); - reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, + soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, GET_INST(GC, inst)); + reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot); wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; if (wave_cnt != 0) { queue_cnt->wave_cnt += wave_cnt; queue_cnt->doorbell_off = - (RREG32_SOC15(GC, inst, mmCP_HQD_PQ_DOORBELL_CONTROL) & + (RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL) & CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >> CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; } @@ -1033,7 +1033,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES); lock_spi_csq_mutexes(adev); - soc15_grbm_select(adev, 1, 0, 0, 0, inst); + soc15_grbm_select(adev, 1, 0, 0, 0, GET_INST(GC, inst)); /* * Iterate through the shader engines and arrays of the device @@ -1046,7 +1046,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, se_cnt = adev->gfx.config.max_shader_engines; for (se_idx = 0; se_idx < se_cnt; se_idx++) { amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst); - queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS); + queue_map = RREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_CSQ_WF_ACTIVE_STATUS); /* * Assumption: queue map encodes following schema: four @@ -1071,7 +1071,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, } amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst); - soc15_grbm_select(adev, 0, 0, 0, 0, inst); + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst)); unlock_spi_csq_mutexes(adev); /* Update the output parameters and return */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 29578550b4781..648f40091aa39 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -3542,15 +3542,19 @@ int debug_refresh_runlist(struct device_queue_manager *dqm) bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, struct qcm_process_device *qpd, - int doorbell_off) + int doorbell_off, u32 *queue_format) { struct queue *q; bool r = false; + if (!queue_format) + return r; + dqm_lock(dqm); list_for_each_entry(q, &qpd->queues_list, list) { if (q->properties.doorbell_off == doorbell_off) { + *queue_format = q->properties.format; r = true; goto out; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 80be2036abeac..09ab36f8e8c69 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -326,7 +326,7 @@ int debug_map_and_unlock(struct device_queue_manager *dqm); int debug_refresh_runlist(struct device_queue_manager *dqm); bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, struct qcm_process_device *qpd, - int doorbell_off); + int doorbell_off, u32 *queue_format); static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index d73841268c9ba..d07acf1b2f93c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -272,6 +272,7 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) struct kfd_process_device *pdd = NULL; int i; struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES]; + u32 queue_format; memset(cu_occupancy, 0x0, sizeof(cu_occupancy)); @@ -292,14 +293,27 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) wave_cnt = 0; max_waves_per_cu = 0; + /* + * For GFX 9.4.3, fetch the CU occupancy from the first XCC in the partition. + * For AQL queues, because of cooperative dispatch we multiply the wave count + * by number of XCCs in the partition to get the total wave counts across all + * XCCs in the partition. + * For PM4 queues, there is no cooperative dispatch so wave_cnt stay as it is. + */ dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy, - &max_waves_per_cu, 0); + &max_waves_per_cu, ffs(dev->xcc_mask) - 1); for (i = 0; i < AMDGPU_MAX_QUEUES; i++) { if (cu_occupancy[i].wave_cnt != 0 && kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd, - cu_occupancy[i].doorbell_off)) - wave_cnt += cu_occupancy[i].wave_cnt; + cu_occupancy[i].doorbell_off, + &queue_format)) { + if (unlikely(queue_format == KFD_QUEUE_FORMAT_PM4)) + wave_cnt += cu_occupancy[i].wave_cnt; + else + wave_cnt += (NUM_XCC(dev->xcc_mask) * + cu_occupancy[i].wave_cnt); + } } /* Translate wave count to number of compute units */ From 8048e5ade8224969023902b0b3f64470f9c250a7 Mon Sep 17 00:00:00 2001 From: Saleemkhan Jamadar Date: Fri, 20 Sep 2024 18:40:18 +0530 Subject: [PATCH 281/352] drm/amdgpu/vcn: enable AV1 on both instances v1 - remove cs parse code (Christian) On VCN v4_0_6 AV1 is supported on both the instances. Remove cs IB parse code since explict handling of AV1 schedule is not required. 
Signed-off-by: Saleemkhan Jamadar Reviewed-by: Leo Liu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c | 165 ------------------------ 1 file changed, 165 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c index b1fd226b7efb4..9d4f5352a62c8 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c @@ -1395,170 +1395,6 @@ static void vcn_v4_0_5_unified_ring_set_wptr(struct amdgpu_ring *ring) } } -static int vcn_v4_0_5_limit_sched(struct amdgpu_cs_parser *p, - struct amdgpu_job *job) -{ - struct drm_gpu_scheduler **scheds; - - /* The create msg must be in the first IB submitted */ - if (atomic_read(&job->base.entity->fence_seq)) - return -EINVAL; - - /* if VCN0 is harvested, we can't support AV1 */ - if (p->adev->vcn.harvest_config & AMDGPU_VCN_HARVEST_VCN0) - return -EINVAL; - - scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_ENC] - [AMDGPU_RING_PRIO_0].sched; - drm_sched_entity_modify_sched(job->base.entity, scheds, 1); - return 0; -} - -static int vcn_v4_0_5_dec_msg(struct amdgpu_cs_parser *p, struct amdgpu_job *job, - uint64_t addr) -{ - struct ttm_operation_ctx ctx = { false, false }; - struct amdgpu_bo_va_mapping *map; - uint32_t *msg, num_buffers; - struct amdgpu_bo *bo; - uint64_t start, end; - unsigned int i; - void *ptr; - int r; - - addr &= AMDGPU_GMC_HOLE_MASK; - r = amdgpu_cs_find_mapping(p, addr, &bo, &map); - if (r) { - DRM_ERROR("Can't find BO for addr 0x%08llx\n", addr); - return r; - } - - start = map->start * AMDGPU_GPU_PAGE_SIZE; - end = (map->last + 1) * AMDGPU_GPU_PAGE_SIZE; - if (addr & 0x7) { - DRM_ERROR("VCN messages must be 8 byte aligned!\n"); - return -EINVAL; - } - - bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; - amdgpu_bo_placement_from_domain(bo, bo->allowed_domains); - r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); - if (r) { - DRM_ERROR("Failed validating the VCN message BO (%d)!\n", r); - return r; - } - - r = amdgpu_bo_kmap(bo, &ptr); - if (r) { - DRM_ERROR("Failed mapping the VCN message (%d)!\n", r); - return r; - } - - msg = ptr + addr - start; - - /* Check length */ - if (msg[1] > end - addr) { - r = -EINVAL; - goto out; - } - - if (msg[3] != RDECODE_MSG_CREATE) - goto out; - - num_buffers = msg[2]; - for (i = 0, msg = &msg[6]; i < num_buffers; ++i, msg += 4) { - uint32_t offset, size, *create; - - if (msg[0] != RDECODE_MESSAGE_CREATE) - continue; - - offset = msg[1]; - size = msg[2]; - - if (offset + size > end) { - r = -EINVAL; - goto out; - } - - create = ptr + addr + offset - start; - - /* H264, HEVC and VP9 can run on any instance */ - if (create[0] == 0x7 || create[0] == 0x10 || create[0] == 0x11) - continue; - - r = vcn_v4_0_5_limit_sched(p, job); - if (r) - goto out; - } - -out: - amdgpu_bo_kunmap(bo); - return r; -} - -#define RADEON_VCN_ENGINE_TYPE_ENCODE (0x00000002) -#define RADEON_VCN_ENGINE_TYPE_DECODE (0x00000003) - -#define RADEON_VCN_ENGINE_INFO (0x30000001) -#define RADEON_VCN_ENGINE_INFO_MAX_OFFSET 16 - -#define RENCODE_ENCODE_STANDARD_AV1 2 -#define RENCODE_IB_PARAM_SESSION_INIT 0x00000003 -#define RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET 64 - -/* return the offset in ib if id is found, -1 otherwise - * to speed up the searching we only search upto max_offset - */ -static int vcn_v4_0_5_enc_find_ib_param(struct amdgpu_ib *ib, uint32_t id, int max_offset) -{ - int i; - - for (i = 0; i < ib->length_dw && i < max_offset && ib->ptr[i] >= 8; i += ib->ptr[i]/4) { - if (ib->ptr[i + 1] 
== id) - return i; - } - return -1; -} - -static int vcn_v4_0_5_ring_patch_cs_in_place(struct amdgpu_cs_parser *p, - struct amdgpu_job *job, - struct amdgpu_ib *ib) -{ - struct amdgpu_ring *ring = amdgpu_job_ring(job); - struct amdgpu_vcn_decode_buffer *decode_buffer; - uint64_t addr; - uint32_t val; - int idx; - - /* The first instance can decode anything */ - if (!ring->me) - return 0; - - /* RADEON_VCN_ENGINE_INFO is at the top of ib block */ - idx = vcn_v4_0_5_enc_find_ib_param(ib, RADEON_VCN_ENGINE_INFO, - RADEON_VCN_ENGINE_INFO_MAX_OFFSET); - if (idx < 0) /* engine info is missing */ - return 0; - - val = amdgpu_ib_get_value(ib, idx + 2); /* RADEON_VCN_ENGINE_TYPE */ - if (val == RADEON_VCN_ENGINE_TYPE_DECODE) { - decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[idx + 6]; - - if (!(decode_buffer->valid_buf_flag & 0x1)) - return 0; - - addr = ((u64)decode_buffer->msg_buffer_address_hi) << 32 | - decode_buffer->msg_buffer_address_lo; - return vcn_v4_0_5_dec_msg(p, job, addr); - } else if (val == RADEON_VCN_ENGINE_TYPE_ENCODE) { - idx = vcn_v4_0_5_enc_find_ib_param(ib, RENCODE_IB_PARAM_SESSION_INIT, - RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET); - if (idx >= 0 && ib->ptr[idx + 2] == RENCODE_ENCODE_STANDARD_AV1) - return vcn_v4_0_5_limit_sched(p, job); - } - return 0; -} - static const struct amdgpu_ring_funcs vcn_v4_0_5_unified_ring_vm_funcs = { .type = AMDGPU_RING_TYPE_VCN_ENC, .align_mask = 0x3f, @@ -1566,7 +1402,6 @@ static const struct amdgpu_ring_funcs vcn_v4_0_5_unified_ring_vm_funcs = { .get_rptr = vcn_v4_0_5_unified_ring_get_rptr, .get_wptr = vcn_v4_0_5_unified_ring_get_wptr, .set_wptr = vcn_v4_0_5_unified_ring_set_wptr, - .patch_cs_in_place = vcn_v4_0_5_ring_patch_cs_in_place, .emit_frame_size = SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 4 + From d52ac79053a2f3eba04c1e7b56334df84d1d289f Mon Sep 17 00:00:00 2001 From: Sreekant Somasekharan Date: Fri, 20 Sep 2024 01:53:17 -0400 Subject: [PATCH 282/352] drm/amdkfd: Add SDMA queue quantum support for GFX12 program SDMAx_QUEUEx_SCHEDULE_CNTL for context switch due to quantum in KFD for GFX12. Signed-off-by: Sreekant Somasekharan Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c index d163d92a692f6..2b72d5b4949b6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c @@ -341,6 +341,10 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdmax_rlcx_doorbell_offset = q->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT; + m->sdmax_rlcx_sched_cntl = (amdgpu_sdma_phase_quantum + << SDMA0_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM__SHIFT) + & SDMA0_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM_MASK; + m->sdma_engine_id = q->sdma_engine_id; m->sdma_queue_id = q->sdma_queue_id; From d782d6e1d9078d6b82f8468dd6421050165e7d75 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 23 Sep 2024 22:39:11 +0900 Subject: [PATCH 283/352] ksmbd: remove unsafe_memcpy use in session setup Kees pointed out to just use directly ->Buffer instead of pointing ->Buffer using offset not to use unsafe_memcpy(). 
Suggested-by: Kees Cook Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 7121266daa020..72af3ab40b5c3 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1335,8 +1335,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, return rc; sz = le16_to_cpu(rsp->SecurityBufferOffset); - chgblob = - (struct challenge_message *)((char *)&rsp->hdr.ProtocolId + sz); + chgblob = (struct challenge_message *)rsp->Buffer; memset(chgblob, 0, sizeof(struct challenge_message)); if (!work->conn->use_spnego) { @@ -1369,9 +1368,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, goto out; } - sz = le16_to_cpu(rsp->SecurityBufferOffset); - unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len, - /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); + memcpy(rsp->Buffer, spnego_blob, spnego_blob_len); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); out: @@ -1453,10 +1450,7 @@ static int ntlm_authenticate(struct ksmbd_work *work, if (rc) return -ENOMEM; - sz = le16_to_cpu(rsp->SecurityBufferOffset); - unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, - spnego_blob_len, - /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); + memcpy(rsp->Buffer, spnego_blob, spnego_blob_len); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); kfree(spnego_blob); } From 8e2f6a0e2dc9db663b4ba2225822e7a3c4047bfb Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 24 Sep 2024 22:39:29 +0900 Subject: [PATCH 284/352] ksmbd: fix open failure from block and char device file char/block device file can't be opened with dentry_open() if device driver is not loaded. Use O_PATH flags for fake opening file to handle it if file is a block or char file. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 72af3ab40b5c3..7460089c186f0 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2052,18 +2052,20 @@ int smb2_tree_connect(struct ksmbd_work *work) * @access: file access flags * @disposition: file disposition flags * @may_flags: set with MAY_ flags - * @is_dir: is creating open flags for directory + * @coptions: file creation options + * @mode: file mode * * Return: file open flags */ static int smb2_create_open_flags(bool file_present, __le32 access, __le32 disposition, int *may_flags, - bool is_dir) + __le32 coptions, + umode_t mode) { int oflags = O_NONBLOCK | O_LARGEFILE; - if (is_dir) { + if (coptions & FILE_DIRECTORY_FILE_LE || S_ISDIR(mode)) { access &= ~FILE_WRITE_DESIRE_ACCESS_LE; ksmbd_debug(SMB, "Discard write access to a directory\n"); } @@ -2080,7 +2082,7 @@ static int smb2_create_open_flags(bool file_present, __le32 access, *may_flags = MAY_OPEN | MAY_READ; } - if (access == FILE_READ_ATTRIBUTES_LE) + if (access == FILE_READ_ATTRIBUTES_LE || S_ISBLK(mode) || S_ISCHR(mode)) oflags |= O_PATH; if (file_present) { @@ -3175,8 +3177,8 @@ int smb2_open(struct ksmbd_work *work) open_flags = smb2_create_open_flags(file_present, daccess, req->CreateDisposition, &may_flags, - req->CreateOptions & FILE_DIRECTORY_FILE_LE || - (file_present && S_ISDIR(d_inode(path.dentry)->i_mode))); + req->CreateOptions, + file_present ? 
d_inode(path.dentry)->i_mode : 0); if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { if (open_flags & (O_CREAT | O_TRUNC)) { From 9e676e571d39eb6189bf6d55a9c401ba2dd13410 Mon Sep 17 00:00:00 2001 From: Shen Lichuan Date: Wed, 25 Sep 2024 15:43:23 +0800 Subject: [PATCH 285/352] ksmbd: Correct typos in multiple comments across various files Fixed some confusing typos that were currently identified witch codespell, the details are as follows: -in the code comments: fs/smb/common/smb2pdu.h:9: specfication ==> specification fs/smb/common/smb2pdu.h:494: usally ==> usually fs/smb/common/smb2pdu.h:1064: Attrubutes ==> Attributes fs/smb/server/connection.c:28: cleand ==> cleaned fs/smb/server/ksmbd_netlink.h:216: struture ==> structure fs/smb/server/oplock.c:799: conains ==> contains fs/smb/server/oplock.c:1487: containted ==> contained fs/smb/server/server.c:282: proccessing ==> processing fs/smb/server/smb_common.c:491: comforms ==> conforms fs/smb/server/xattr.h:102: ATTRIBUITE ==> ATTRIBUTE Signed-off-by: Shen Lichuan Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/common/smb2pdu.h | 6 +++--- fs/smb/server/connection.c | 2 +- fs/smb/server/ksmbd_netlink.h | 2 +- fs/smb/server/oplock.c | 4 ++-- fs/smb/server/server.c | 2 +- fs/smb/server/smb_common.c | 2 +- fs/smb/server/xattr.h | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index c769f9dbc0b46..9f272cc8f5660 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -6,7 +6,7 @@ * Note that, due to trying to use names similar to the protocol specifications, * there are many mixed case field names in the structures below. Although * this does not match typical Linux kernel style, it is necessary to be - * able to match against the protocol specfication. + * able to match against the protocol specification. * * SMB2 commands * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses @@ -491,7 +491,7 @@ struct smb2_encryption_neg_context { __le16 ContextType; /* 2 */ __le16 DataLength; __le32 Reserved; - /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */ + /* CipherCount usually 2, but can be 3 when AES256-GCM enabled */ __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */ __le16 Ciphers[]; } __packed; @@ -1061,7 +1061,7 @@ struct smb2_server_client_notification { #define IL_IMPERSONATION cpu_to_le32(0x00000002) #define IL_DELEGATE cpu_to_le32(0x00000003) -/* File Attrubutes */ +/* File Attributes */ #define FILE_ATTRIBUTE_READONLY 0x00000001 #define FILE_ATTRIBUTE_HIDDEN 0x00000002 #define FILE_ATTRIBUTE_SYSTEM 0x00000004 diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index cac80e7bfefc7..aa2a37a7ce847 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -25,7 +25,7 @@ DECLARE_RWSEM(conn_list_lock); /** * ksmbd_conn_free() - free resources of the connection instance * - * @conn: connection instance to be cleand up + * @conn: connection instance to be cleaned up * * During the thread termination, the corresponding conn instance * resources(sock/memory) are released and finally the conn object is freed. diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index f4e55199938d5..38e6fd2da3b80 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -213,7 +213,7 @@ struct ksmbd_tree_connect_response { }; /* - * IPC Request struture to disconnect tree connection. 
+ * IPC Request structure to disconnect tree connection. */ struct ksmbd_tree_disconnect_request { __u64 session_id; /* session id */ diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 246cde380dfba..4142c7ad5fa91 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -796,7 +796,7 @@ static void __smb2_lease_break_noti(struct work_struct *wk) /** * smb2_lease_break_noti() - break lease when a new client request * write lease - * @opinfo: conains lease state information + * @opinfo: contains lease state information * * Return: 0 on success, otherwise error */ @@ -1484,7 +1484,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) } /** - * parse_lease_state() - parse lease context containted in file open request + * parse_lease_state() - parse lease context contained in file open request * @open_req: buffer containing smb2 file open(create) request * * Return: allocated lease context object on success, otherwise NULL diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index c402d4abe8263..231d2d224656b 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -279,7 +279,7 @@ static void handle_ksmbd_work(struct work_struct *wk) /** * queue_ksmbd_work() - queue a smb request to worker thread queue - * for proccessing smb command and sending response + * for processing smb command and sending response * @conn: connection instance * * read remaining data from socket create and submit work. diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index cc4bb2377cbda..5b8d75e78ffb8 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -488,7 +488,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, * @shortname: destination short filename * * Return: shortname length or 0 when source long name is '.' or '..' - * TODO: Though this function comforms the restriction of 8.3 Filename spec, + * TODO: Though this function conforms the restriction of 8.3 Filename spec, * but the result is different with Windows 7's one. need to check. */ int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname, diff --git a/fs/smb/server/xattr.h b/fs/smb/server/xattr.h index fa3e27d6971b8..505101a8104c6 100644 --- a/fs/smb/server/xattr.h +++ b/fs/smb/server/xattr.h @@ -99,7 +99,7 @@ struct xattr_ntacl { __u8 posix_acl_hash[XATTR_SD_HASH_SIZE]; /* 64bytes hash for posix acl */ }; -/* DOS ATTRIBUITE XATTR PREFIX */ +/* DOS ATTRIBUTE XATTR PREFIX */ #define DOS_ATTRIBUTE_PREFIX "DOSATTRIB" #define DOS_ATTRIBUTE_PREFIX_LEN (sizeof(DOS_ATTRIBUTE_PREFIX) - 1) #define XATTR_NAME_DOS_ATTRIBUTE (XATTR_USER_PREFIX DOS_ATTRIBUTE_PREFIX) From 5b97eebcce1b4f3f07a71f635d6aa3af96c236e7 Mon Sep 17 00:00:00 2001 From: Qianqiang Liu Date: Wed, 25 Sep 2024 13:29:36 +0800 Subject: [PATCH 286/352] fbcon: Fix a NULL pointer dereference issue in fbcon_putcs syzbot has found a NULL pointer dereference bug in fbcon. 
Here is the simplified C reproducer: struct param { uint8_t type; struct tiocl_selection ts; }; int main() { struct fb_con2fbmap con2fb; struct param param; int fd = open("/dev/fb1", 0, 0); con2fb.console = 0x19; con2fb.framebuffer = 0; ioctl(fd, FBIOPUT_CON2FBMAP, &con2fb); param.type = 2; param.ts.xs = 0; param.ts.ys = 0; param.ts.xe = 0; param.ts.ye = 0; param.ts.sel_mode = 0; int fd1 = open("/dev/tty1", O_RDWR, 0); ioctl(fd1, TIOCLINUX, ¶m); con2fb.console = 1; con2fb.framebuffer = 0; ioctl(fd, FBIOPUT_CON2FBMAP, &con2fb); return 0; } After calling ioctl(fd1, TIOCLINUX, ¶m), the subsequent ioctl(fd, FBIOPUT_CON2FBMAP, &con2fb) causes the kernel to follow a different execution path: set_con2fb_map -> con2fb_init_display -> fbcon_set_disp -> redraw_screen -> hide_cursor -> clear_selection -> highlight -> invert_screen -> do_update_region -> fbcon_putcs -> ops->putcs Since ops->putcs is a NULL pointer, this leads to a kernel panic. To prevent this, we need to call set_blitting_type() within set_con2fb_map() to properly initialize ops->putcs. Reported-by: syzbot+3d613ae53c031502687a@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=3d613ae53c031502687a Tested-by: syzbot+3d613ae53c031502687a@syzkaller.appspotmail.com Signed-off-by: Qianqiang Liu Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fbcon.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 2e093535884b4..d9abae2516d84 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -861,6 +861,8 @@ static int set_con2fb_map(int unit, int newidx, int user) return err; fbcon_add_cursor_work(info); + } else if (vc) { + set_blitting_type(vc, info); } con2fb_map[unit] = newidx; From f1ebbe4cd07d058f42174cc5b8c5efcf83de8ffa Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 25 Sep 2024 21:12:36 +0200 Subject: [PATCH 287/352] fbdev: omapfb: Call of_node_put(ep) only once in omapdss_of_find_source_for_first_ep() An of_node_put(ep) call was immediately used after a pointer check for a of_graph_get_remote_port() call in this function implementation. Thus call such a function only once instead directly before the check. This issue was transformed by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Helge Deller --- drivers/video/fbdev/omap2/omapfb/dss/dss-of.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c b/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c index 4040e247e026e..d5a43b3bf45ec 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c @@ -129,12 +129,9 @@ omapdss_of_find_source_for_first_ep(struct device_node *node) return ERR_PTR(-EINVAL); src_port = of_graph_get_remote_port(ep); - if (!src_port) { - of_node_put(ep); - return ERR_PTR(-EINVAL); - } - of_node_put(ep); + if (!src_port) + return ERR_PTR(-EINVAL); src = omap_dss_find_output_by_port_node(src_port); From 2555906fd53e0a5239431d44fad695b420e94fdd Mon Sep 17 00:00:00 2001 From: Qianqiang Liu Date: Thu, 26 Sep 2024 19:59:11 +0800 Subject: [PATCH 288/352] fbcon: break earlier in search_fb_in_map and search_for_mapped_con Break the for loop immediately upon finding the target, making the process more efficient. 
Signed-off-by: Qianqiang Liu Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fbcon.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index d9abae2516d84..e8b4e8c119b5c 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -512,8 +512,10 @@ static int search_fb_in_map(int idx) int i, retval = 0; for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (con2fb_map[i] == idx) + if (con2fb_map[i] == idx) { retval = 1; + break; + } } return retval; } @@ -523,8 +525,10 @@ static int search_for_mapped_con(void) int i, retval = 0; for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (con2fb_map[i] != -1) + if (con2fb_map[i] != -1) { retval = 1; + break; + } } return retval; } From d4fc4d01471528da8a9797a065982e05090e1d81 Mon Sep 17 00:00:00 2001 From: "Alexey Gladkov (Intel)" Date: Fri, 13 Sep 2024 19:05:56 +0200 Subject: [PATCH 289/352] x86/tdx: Fix "in-kernel MMIO" check TDX only supports kernel-initiated MMIO operations. The handle_mmio() function checks if the #VE exception occurred in the kernel and rejects the operation if it did not. However, userspace can deceive the kernel into performing MMIO on its behalf. For example, if userspace can point a syscall to an MMIO address, syscall does get_user() or put_user() on it, triggering MMIO #VE. The kernel will treat the #VE as in-kernel MMIO. Ensure that the target MMIO address is within the kernel before decoding instruction. Fixes: 31d58c4e557d ("x86/tdx: Handle in-kernel MMIO") Signed-off-by: Alexey Gladkov (Intel) Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Acked-by: Dave Hansen Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/565a804b80387970460a4ebc67c88d1380f61ad1.1726237595.git.legion%40kernel.org --- arch/x86/coco/tdx/tdx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index da8b66dce0da5..327c45c5013fe 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -16,6 +16,7 @@ #include #include #include +#include /* MMIO direction */ #define EPT_READ 0 @@ -433,6 +434,11 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) return -EINVAL; } + if (!fault_in_kernel_space(ve->gla)) { + WARN_ONCE(1, "Access to userspace address is not supported"); + return -EINVAL; + } + /* * Reject EPT violation #VEs that split pages. * From d1fb034b75a8a96fcb4bf01a7c0e1421eef833a3 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 23 Sep 2024 10:37:50 -0700 Subject: [PATCH 290/352] x86/cpu: Add two Intel CPU model numbers Pantherlake is a mobile CPU. Diamond Rapids next generation Xeon. 
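For illustration, a minimal sketch of how such model defines are typically consumed (hypothetical consumer, not part of this patch; it assumes the x86_vfm field present in recent kernels, which the IFM() encoding is meant to be compared against):

	/* Hypothetical user of the new constant, for illustration only. */
	if (boot_cpu_data.x86_vfm == INTEL_PANTHERLAKE_L)
		apply_pantherlake_quirk();	/* placeholder helper, not a real function */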
Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20240923173750.16874-1-tony.luck%40intel.com --- arch/x86/include/asm/intel-family.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index f81a851c46dca..17d899da46502 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -191,6 +191,8 @@ #define INTEL_FAM6_LUNARLAKE_M 0xBD #define INTEL_LUNARLAKE_M IFM(6, 0xBD) +#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) + /* "Small Core" Processors (Atom/E-Core) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ @@ -257,4 +259,7 @@ #define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */ #define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ +/* Family 19 */ +#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ + #endif /* _ASM_X86_INTEL_FAMILY_H */ From bfc4a245a794841cba5cf287034a0f60d3087402 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Sep 2024 08:35:24 +0200 Subject: [PATCH 291/352] dma-mapping: fix DMA API tracing for chained scatterlists scatterlist allocations can be chained, and thus all iterations need to use the chain-aware iterators. Switch the newly added tracing to use the proper iterators so that they work with chained scatterlists. Fixes: 038eb433dc14 ("dma-mapping: add tracing for dma-mapping API calls") Reported-by: syzbot+95e4ef83a3024384ec7a@syzkaller.appspotmail.com Signed-off-by: Christoph Hellwig Reviewed-by: Sean Anderson Tested-by: syzbot+95e4ef83a3024384ec7a@syzkaller.appspotmail.com --- include/trace/events/dma.h | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index f57f05331d738..569f86a44aaaf 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -176,9 +176,9 @@ TRACE_EVENT(dma_free, ); TRACE_EVENT(dma_map_sg, - TP_PROTO(struct device *dev, struct scatterlist *sg, int nents, + TP_PROTO(struct device *dev, struct scatterlist *sgl, int nents, int ents, enum dma_data_direction dir, unsigned long attrs), - TP_ARGS(dev, sg, nents, ents, dir, attrs), + TP_ARGS(dev, sgl, nents, ents, dir, attrs), TP_STRUCT__entry( __string(device, dev_name(dev)) @@ -190,17 +190,17 @@ TRACE_EVENT(dma_map_sg, ), TP_fast_assign( + struct scatterlist *sg; int i; __assign_str(device); - for (i = 0; i < nents; i++) - ((u64 *)__get_dynamic_array(phys_addrs))[i] = - sg_phys(sg + i); - for (i = 0; i < ents; i++) { + for_each_sg(sgl, sg, nents, i) + ((u64 *)__get_dynamic_array(phys_addrs))[i] = sg_phys(sg); + for_each_sg(sgl, sg, ents, i) { ((u64 *)__get_dynamic_array(dma_addrs))[i] = - sg_dma_address(sg + i); + sg_dma_address(sg); ((unsigned int *)__get_dynamic_array(lengths))[i] = - sg_dma_len(sg + i); + sg_dma_len(sg); } __entry->dir = dir; __entry->attrs = attrs; @@ -222,9 +222,9 @@ TRACE_EVENT(dma_map_sg, ); TRACE_EVENT(dma_unmap_sg, - TP_PROTO(struct device *dev, struct scatterlist *sg, int nents, + TP_PROTO(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs), - TP_ARGS(dev, sg, nents, dir, attrs), + TP_ARGS(dev, sgl, nents, dir, attrs), TP_STRUCT__entry( __string(device, dev_name(dev)) @@ -234,12 +234,12 @@ TRACE_EVENT(dma_unmap_sg, ), TP_fast_assign( + struct scatterlist *sg; int i; __assign_str(device); - for (i = 0; i < nents; i++) - ((u64 *)__get_dynamic_array(addrs))[i] = - sg_phys(sg + i); + for_each_sg(sgl, 
sg, nents, i) + ((u64 *)__get_dynamic_array(addrs))[i] = sg_phys(sg); __entry->dir = dir; __entry->attrs = attrs; ), @@ -290,9 +290,9 @@ DEFINE_EVENT(dma_sync_single, dma_sync_single_for_device, TP_ARGS(dev, dma_addr, size, dir)); DECLARE_EVENT_CLASS(dma_sync_sg, - TP_PROTO(struct device *dev, struct scatterlist *sg, int nents, + TP_PROTO(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir), - TP_ARGS(dev, sg, nents, dir), + TP_ARGS(dev, sgl, nents, dir), TP_STRUCT__entry( __string(device, dev_name(dev)) @@ -302,14 +302,15 @@ DECLARE_EVENT_CLASS(dma_sync_sg, ), TP_fast_assign( + struct scatterlist *sg; int i; __assign_str(device); - for (i = 0; i < nents; i++) { + for_each_sg(sgl, sg, nents, i) { ((u64 *)__get_dynamic_array(dma_addrs))[i] = - sg_dma_address(sg + i); + sg_dma_address(sg); ((unsigned int *)__get_dynamic_array(lengths))[i] = - sg_dma_len(sg + i); + sg_dma_len(sg); } __entry->dir = dir; ), From 3cb576bc6dfb8940228b8130638860b631dd428a Mon Sep 17 00:00:00 2001 From: Frank Min Date: Wed, 25 Sep 2024 11:39:06 +0800 Subject: [PATCH 292/352] drm/amdgpu: fix PTE copy corruption for sdma 7 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without setting dcc bit, there is ramdon PTE copy corruption on sdma 7. so add this bit and update the packet format accordingly. Acked-by: Alex Deucher Signed-off-by: Frank Min Reviewed-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c index cfd8e183ad503..a8763496aed31 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c @@ -1080,13 +1080,16 @@ static void sdma_v7_0_vm_copy_pte(struct amdgpu_ib *ib, unsigned bytes = count * 8; ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) | - SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR); + SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) | + SDMA_PKT_COPY_LINEAR_HEADER_CPV(1); + ib->ptr[ib->length_dw++] = bytes - 1; ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */ ib->ptr[ib->length_dw++] = lower_32_bits(src); ib->ptr[ib->length_dw++] = upper_32_bits(src); ib->ptr[ib->length_dw++] = lower_32_bits(pe); ib->ptr[ib->length_dw++] = upper_32_bits(pe); + ib->ptr[ib->length_dw++] = 0; } @@ -1744,7 +1747,7 @@ static void sdma_v7_0_set_buffer_funcs(struct amdgpu_device *adev) } static const struct amdgpu_vm_pte_funcs sdma_v7_0_vm_pte_funcs = { - .copy_pte_num_dw = 7, + .copy_pte_num_dw = 8, .copy_pte = sdma_v7_0_vm_copy_pte, .write_pte = sdma_v7_0_vm_write_pte, .set_pte_pde = sdma_v7_0_vm_set_pte_pde, From a8387ddc0d15a365dd04baaa325a863d3612e020 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 25 Sep 2024 14:17:53 -0400 Subject: [PATCH 293/352] drm/amdgpu: fix vbios fetching for SR-IOV SR-IOV fetches the vbios from VRAM in some cases. Re-enable the VRAM path for dGPUs and rename the function to make it clear that it is not IGP specific. 
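In outline, the dGPU path now tries VRAM before falling back to the platform BIOS (a sketch using only the helpers touched by the hunks below; the earlier ROM/ACPI attempts are unchanged and omitted):

	/* this is required for SR-IOV */
	if (amdgpu_read_bios_from_vram(adev))
		goto success;
	if (amdgpu_read_platform_bios(adev))
		goto success;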
Fixes: 042658d17a54 ("drm/amdgpu: clean up vbios fetching code") Reviewed-by: Yang Wang Tested-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c index 46bf623919d7c..45affc02548c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c @@ -87,8 +87,9 @@ static bool check_atom_bios(uint8_t *bios, size_t size) * part of the system bios. On boot, the system bios puts a * copy of the igp rom at the start of vram if a discrete card is * present. + * For SR-IOV, the vbios image is also put in VRAM in the VF. */ -static bool igp_read_bios_from_vram(struct amdgpu_device *adev) +static bool amdgpu_read_bios_from_vram(struct amdgpu_device *adev) { uint8_t __iomem *bios; resource_size_t vram_base; @@ -414,7 +415,7 @@ static bool amdgpu_get_bios_apu(struct amdgpu_device *adev) goto success; } - if (igp_read_bios_from_vram(adev)) { + if (amdgpu_read_bios_from_vram(adev)) { dev_info(adev->dev, "Fetched VBIOS from VRAM BAR\n"); goto success; } @@ -448,6 +449,12 @@ static bool amdgpu_get_bios_dgpu(struct amdgpu_device *adev) goto success; } + /* this is required for SR-IOV */ + if (amdgpu_read_bios_from_vram(adev)) { + dev_info(adev->dev, "Fetched VBIOS from VRAM BAR\n"); + goto success; + } + if (amdgpu_read_platform_bios(adev)) { dev_info(adev->dev, "Fetched VBIOS from platform\n"); goto success; From 34ad56a467c320d07db22146cfb99ee01704a5de Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 6 Sep 2024 13:51:06 -0400 Subject: [PATCH 294/352] drm/amdgpu: bump driver version for cleared VRAM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Driver now clears VRAM on allocation. Bump the driver version so mesa knows when it will get cleared vram by default. Reviewed-by: Marek Olšák Reviewed-by: Rajneesh Bhardwaj Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.11.x --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 86cff30d5c4e9..db0763ffeff7d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -117,9 +117,10 @@ * - 3.56.0 - Update IB start address and size alignment for decode and encode * - 3.57.0 - Compute tunneling on GFX10+ * - 3.58.0 - Add GFX12 DCC support + * - 3.59.0 - Cleared VRAM */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 58 +#define KMS_DRIVER_MINOR 59 #define KMS_DRIVER_PATCHLEVEL 0 /* From d7d2688bf4ea58734d73e18edcbf4684b1496d30 Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Fri, 20 Sep 2024 19:05:37 +0800 Subject: [PATCH 295/352] drm/amd/pm: update workload mask after the setting update workload mask after the setting. 
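The pattern applied in all three hunks below, distilled (a sketch, not extra code): cache the new mask only once the SMU message has succeeded, so a failed request cannot leave smu->workload_mask out of sync with the firmware.

	ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask,
					      workload_mask, NULL);
	if (!ret)
		smu->workload_mask = workload_mask;	/* only after the SMU accepted it */
	return ret;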
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3625 Signed-off-by: Kenneth Feng Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 6 +++++- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 3 +++ drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 6 +++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index a887ab945dfa2..1d024b122b0c0 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2569,10 +2569,14 @@ static int smu_v13_0_0_set_power_profile_mode(struct smu_context *smu, } } - return smu_cmn_send_smc_msg_with_param(smu, + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask, workload_mask, NULL); + if (!ret) + smu->workload_mask = workload_mask; + + return ret; } static bool smu_v13_0_0_is_mode1_reset_supported(struct smu_context *smu) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index 7bc95c4043778..b891a5e0a3969 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -2501,8 +2501,11 @@ static int smu_v13_0_7_set_power_profile_mode(struct smu_context *smu, long *inp return -EINVAL; ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask, 1 << workload_type, NULL); + if (ret) dev_err(smu->adev->dev, "[%s] Failed to set work load mask!", __func__); + else + smu->workload_mask = (1 << workload_type); return ret; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 43820d7d2c54a..5899d01fa73d3 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -1861,10 +1861,14 @@ static int smu_v14_0_2_set_power_profile_mode(struct smu_context *smu, if (workload_type < 0) return -EINVAL; - return smu_cmn_send_smc_msg_with_param(smu, + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask, 1 << workload_type, NULL); + if (!ret) + smu->workload_mask = 1 << workload_type; + + return ret; } static int smu_v14_0_2_baco_enter(struct smu_context *smu) From df9b455633aee0bad3e5c3dc9fc1c860b13c96d2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 26 Sep 2024 14:58:30 +0100 Subject: [PATCH 296/352] netfs: Fix write oops in generic/346 (9p) and generic/074 (cifs) In netfslib, a buffered writeback operation has a 'write queue' of folios that are being written, held in a linear sequence of folio_queue structs. The 'issuer' adds new folio_queues on the leading edge of the queue and populates each one progressively; the 'collector' pops them off the trailing edge and discards them and the folios they point to as they are consumed. The queue is required to always retain at least one folio_queue structure. This allows the queue to be accessed without locking and with just a bit of barriering. When a new subrequest is prepared, its ->io_iter iterator is pointed at the current end of the write queue and then the iterator is extended as more data is added to the queue until the subrequest is committed. 
Now, the problem is that the folio_queue at the leading edge of the write queue when a subrequest is prepared might have been entirely consumed - but not yet removed from the queue as it is the only remaining one and is preventing the queue from collapsing. So, what happens is that subreq->io_iter is pointed at the spent folio_queue, then a new folio_queue is added, and, at that point, the collector is at entirely at liberty to immediately delete the spent folio_queue. This leaves the subreq->io_iter pointing at a freed object. If the system is lucky, iterate_folioq() sees ->io_iter, sees the as-yet uncorrupted freed object and advances to the next folio_queue in the queue. In the case seen, however, the freed object gets recycled and put back onto the queue at the tail and filled to the end. This confuses iterate_folioq() and it tries to step ->next, which may be NULL - resulting in an oops. Fix this by the following means: (1) When preparing a write subrequest, make sure there's a folio_queue struct with space in it at the leading edge of the queue. A function to make space is split out of the function to append a folio so that it can be called for this purpose. (2) If the request struct iterator is pointing to a completely spent folio_queue when we make space, then advance the iterator to the newly allocated folio_queue. The subrequest's iterator will then be set from this. The oops could be triggered using the generic/346 xfstest with a filesystem on9P over TCP with cache=loose. The oops looked something like: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page ... RIP: 0010:_copy_from_iter+0x2db/0x530 ... Call Trace: ... p9pdu_vwritef+0x3d8/0x5d0 p9_client_prepare_req+0xa8/0x140 p9_client_rpc+0x81/0x280 p9_client_write+0xcf/0x1c0 v9fs_issue_write+0x87/0xc0 netfs_advance_write+0xa0/0xb0 netfs_write_folio.isra.0+0x42d/0x500 netfs_writepages+0x15a/0x1f0 do_writepages+0xd1/0x220 filemap_fdatawrite_wbc+0x5c/0x80 v9fs_mmap_vm_close+0x7d/0xb0 remove_vma+0x35/0x70 vms_complete_munmap_vmas+0x11a/0x170 do_vmi_align_munmap+0x17d/0x1c0 do_vmi_munmap+0x13e/0x150 __vm_munmap+0x92/0xd0 __x64_sys_munmap+0x17/0x20 do_syscall_64+0x80/0xe0 entry_SYSCALL_64_after_hwframe+0x71/0x79 This also fixed a similar-looking issue with cifs and generic/074. 
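Condensed, the hazardous interleaving described above looks like this (a schematic sketch, not code from fs/netfs):

	/* issuer (netfs_prepare_write)            collector
	 *
	 * point subreq->io_iter at the tail
	 * folio_queue, which is already spent
	 * append a new folio_queue and link it
	 * in via prev->next
	 *                                         sees the new ->next, frees the
	 *                                         spent folio_queue it had been
	 *                                         keeping as the placeholder
	 * iterate subreq->io_iter
	 *   -> walks a freed (and possibly
	 *      recycled) folio_queue -> oops
	 */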
Fixes: cd0277ed0c18 ("netfs: Use new folio_queue data type and iterator instead of xarray iter") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202409180928.f20b5a08-oliver.sang@intel.com Closes: https://lore.kernel.org/oe-lkp/202409131438.3f225fbf-oliver.sang@intel.com Signed-off-by: David Howells Tested-by: kernel test robot cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Paulo Alcantara cc: Jeff Layton cc: v9fs@lists.linux.dev cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/netfs/internal.h | 1 + fs/netfs/misc.c | 74 ++++++++++++++++++++++++++++++------------ fs/netfs/write_issue.c | 12 ++++++- 3 files changed, 65 insertions(+), 22 deletions(-) diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index c9f0ed24cb7bd..c562aec3b483f 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -58,6 +58,7 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} /* * misc.c */ +struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq); int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, bool needs_put); struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 0ad0982ce0e20..63280791de3b3 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -9,34 +9,66 @@ #include "internal.h" /* - * Append a folio to the rolling queue. + * Make sure there's space in the rolling queue. */ -int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, - bool needs_put) +struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq) { - struct folio_queue *tail = rreq->buffer_tail; - unsigned int slot, order = folio_order(folio); + struct folio_queue *tail = rreq->buffer_tail, *prev; + unsigned int prev_nr_slots = 0; if (WARN_ON_ONCE(!rreq->buffer && tail) || WARN_ON_ONCE(rreq->buffer && !tail)) - return -EIO; - - if (!tail || folioq_full(tail)) { - tail = kmalloc(sizeof(*tail), GFP_NOFS); - if (!tail) - return -ENOMEM; - netfs_stat(&netfs_n_folioq); - folioq_init(tail); - tail->prev = rreq->buffer_tail; - if (tail->prev) - tail->prev->next = tail; - rreq->buffer_tail = tail; - if (!rreq->buffer) { - rreq->buffer = tail; - iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0); + return ERR_PTR(-EIO); + + prev = tail; + if (prev) { + if (!folioq_full(tail)) + return tail; + prev_nr_slots = folioq_nr_slots(tail); + } + + tail = kmalloc(sizeof(*tail), GFP_NOFS); + if (!tail) + return ERR_PTR(-ENOMEM); + netfs_stat(&netfs_n_folioq); + folioq_init(tail); + tail->prev = prev; + if (prev) + /* [!] NOTE: After we set prev->next, the consumer is entirely + * at liberty to delete prev. + */ + WRITE_ONCE(prev->next, tail); + + rreq->buffer_tail = tail; + if (!rreq->buffer) { + rreq->buffer = tail; + iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0); + } else { + /* Make sure we don't leave the master iterator pointing to a + * block that might get immediately consumed. + */ + if (rreq->io_iter.folioq == prev && + rreq->io_iter.folioq_slot == prev_nr_slots) { + rreq->io_iter.folioq = tail; + rreq->io_iter.folioq_slot = 0; } - rreq->buffer_tail_slot = 0; } + rreq->buffer_tail_slot = 0; + return tail; +} + +/* + * Append a folio to the rolling queue. 
+ */ +int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, + bool needs_put) +{ + struct folio_queue *tail; + unsigned int slot, order = folio_order(folio); + + tail = netfs_buffer_make_space(rreq); + if (IS_ERR(tail)) + return PTR_ERR(tail); rreq->io_iter.count += PAGE_SIZE << order; diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 04e66d587f777..0929d9fd4ce7c 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -153,12 +153,22 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, loff_t start) { struct netfs_io_subrequest *subreq; + struct iov_iter *wreq_iter = &wreq->io_iter; + + /* Make sure we don't point the iterator at a used-up folio_queue + * struct being used as a placeholder to prevent the queue from + * collapsing. In such a case, extend the queue. + */ + if (iov_iter_is_folioq(wreq_iter) && + wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) { + netfs_buffer_make_space(wreq); + } subreq = netfs_alloc_subrequest(wreq); subreq->source = stream->source; subreq->start = start; subreq->stream_nr = stream->stream_nr; - subreq->io_iter = wreq->io_iter; + subreq->io_iter = *wreq_iter; _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index); From b0abcd65ec545701b8793e12bc27dc98042b151a Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Thu, 26 Sep 2024 14:46:13 -0300 Subject: [PATCH 297/352] smb: client: fix UAF in async decryption Doing an async decryption (large read) crashes with a slab-use-after-free way down in the crypto API. Reproducer: # mount.cifs -o ...,seal,esize=1 //srv/share /mnt # dd if=/mnt/largefile of=/dev/null ... [ 194.196391] ================================================================== [ 194.196844] BUG: KASAN: slab-use-after-free in gf128mul_4k_lle+0xc1/0x110 [ 194.197269] Read of size 8 at addr ffff888112bd0448 by task kworker/u77:2/899 [ 194.197707] [ 194.197818] CPU: 12 UID: 0 PID: 899 Comm: kworker/u77:2 Not tainted 6.11.0-lku-00028-gfca3ca14a17a-dirty #43 [ 194.198400] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-3-gd478f380-prebuilt.qemu.org 04/01/2014 [ 194.199046] Workqueue: smb3decryptd smb2_decrypt_offload [cifs] [ 194.200032] Call Trace: [ 194.200191] [ 194.200327] dump_stack_lvl+0x4e/0x70 [ 194.200558] ? gf128mul_4k_lle+0xc1/0x110 [ 194.200809] print_report+0x174/0x505 [ 194.201040] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 194.201352] ? srso_return_thunk+0x5/0x5f [ 194.201604] ? __virt_addr_valid+0xdf/0x1c0 [ 194.201868] ? gf128mul_4k_lle+0xc1/0x110 [ 194.202128] kasan_report+0xc8/0x150 [ 194.202361] ? gf128mul_4k_lle+0xc1/0x110 [ 194.202616] gf128mul_4k_lle+0xc1/0x110 [ 194.202863] ghash_update+0x184/0x210 [ 194.203103] shash_ahash_update+0x184/0x2a0 [ 194.203377] ? __pfx_shash_ahash_update+0x10/0x10 [ 194.203651] ? srso_return_thunk+0x5/0x5f [ 194.203877] ? crypto_gcm_init_common+0x1ba/0x340 [ 194.204142] gcm_hash_assoc_remain_continue+0x10a/0x140 [ 194.204434] crypt_message+0xec1/0x10a0 [cifs] [ 194.206489] ? __pfx_crypt_message+0x10/0x10 [cifs] [ 194.208507] ? srso_return_thunk+0x5/0x5f [ 194.209205] ? srso_return_thunk+0x5/0x5f [ 194.209925] ? srso_return_thunk+0x5/0x5f [ 194.210443] ? srso_return_thunk+0x5/0x5f [ 194.211037] decrypt_raw_data+0x15f/0x250 [cifs] [ 194.212906] ? __pfx_decrypt_raw_data+0x10/0x10 [cifs] [ 194.214670] ? srso_return_thunk+0x5/0x5f [ 194.215193] smb2_decrypt_offload+0x12a/0x6c0 [cifs] This is because TFM is being used in parallel. 
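Schematically, the overlap is (one plausible reading of the trace above, not driver code; note that crypt_message() re-keys the TFM on every call):

	offload kworker A                          offload kworker B
	crypto_aead_setkey(server->secmech.dec)    crypto_aead_setkey(server->secmech.dec)
	crypto_aead_decrypt(reqA)                  crypto_aead_decrypt(reqB)

	Both requests share the same per-TFM GHASH state, so one worker can free
	the 4K multiplication table the other is still reading.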
Fix this by allocating a new AEAD TFM for async decryption, but keep the existing one for synchronous READ cases (similar to what is done in smb3_calc_signature()). Also remove the calls to aead_request_set_callback() and crypto_wait_req() since it's always going to be a synchronous operation. Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 47 ++++++++++++++++++++++++----------------- fs/smb/client/smb2pdu.c | 6 ++++++ 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 1ee2dd4a1cae0..177173072bfa9 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4309,7 +4309,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) */ static int crypt_message(struct TCP_Server_Info *server, int num_rqst, - struct smb_rqst *rqst, int enc) + struct smb_rqst *rqst, int enc, struct crypto_aead *tfm) { struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)rqst[0].rq_iov[0].iov_base; @@ -4320,8 +4320,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, u8 key[SMB3_ENC_DEC_KEY_SIZE]; struct aead_request *req; u8 *iv; - DECLARE_CRYPTO_WAIT(wait); - struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); void *creq; size_t sensitive_size; @@ -4333,14 +4331,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - rc = smb3_crypto_aead_allocate(server); - if (rc) { - cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__); - return rc; - } - - tfm = enc ? server->secmech.enc : server->secmech.dec; - if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE); @@ -4380,11 +4370,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, aead_request_set_crypt(req, sg, sg, crypt_len, iv); aead_request_set_ad(req, assoc_data_len); - aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - - rc = crypto_wait_req(enc ? crypto_aead_encrypt(req) - : crypto_aead_decrypt(req), &wait); + rc = enc ? 
crypto_aead_encrypt(req) : crypto_aead_decrypt(req); if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); @@ -4526,7 +4512,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, /* fill the 1st iov with a transform header */ fill_transform_hdr(tr_hdr, orig_len, old_rq, server->cipher_type); - rc = crypt_message(server, num_rqst, new_rq, 1); + rc = crypt_message(server, num_rqst, new_rq, 1, server->secmech.enc); cifs_dbg(FYI, "Encrypt message returned %d\n", rc); if (rc) goto err_free; @@ -4551,8 +4537,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, unsigned int buf_data_size, struct iov_iter *iter, bool is_offloaded) { - struct kvec iov[2]; + struct crypto_aead *tfm; struct smb_rqst rqst = {NULL}; + struct kvec iov[2]; size_t iter_size = 0; int rc; @@ -4568,9 +4555,31 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, iter_size = iov_iter_count(iter); } - rc = crypt_message(server, 1, &rqst, 0); + if (is_offloaded) { + if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) + tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + else + tfm = crypto_alloc_aead("ccm(aes)", 0, 0); + if (IS_ERR(tfm)) { + rc = PTR_ERR(tfm); + cifs_server_dbg(VFS, "%s: Failed alloc decrypt TFM, rc=%d\n", __func__, rc); + + return rc; + } + } else { + if (unlikely(!server->secmech.dec)) + return -EIO; + + tfm = server->secmech.dec; + } + + rc = crypt_message(server, 1, &rqst, 0, tfm); cifs_dbg(FYI, "Decrypt message returned %d\n", rc); + if (is_offloaded) + crypto_free_aead(tfm); + if (rc) return rc; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index bb225758448a6..cb423bfe7377a 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1266,6 +1266,12 @@ SMB2_negotiate(const unsigned int xid, else cifs_server_dbg(VFS, "Missing expected negotiate contexts\n"); } + + if (server->cipher_type && !rc) { + rc = smb3_crypto_aead_allocate(server); + if (rc) + cifs_server_dbg(VFS, "%s: crypto alloc failed, rc=%d\n", __func__, rc); + } neg_exit: free_rsp_buf(resp_buftype, rsp); return rc; From f7025d861694362348efc14eaad6a17840c4e9a4 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Thu, 26 Sep 2024 14:46:14 -0300 Subject: [PATCH 298/352] smb: client: allocate crypto only for primary server For extra channels, point ->secmech.{enc,dec} to the primary server ones. 
Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/smb/client/cifsencrypt.c | 17 +++++++++++------ fs/smb/client/smb2pdu.c | 10 +++++++--- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 7481b21a04898..15aa75e7f1c3d 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -735,13 +735,18 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server) cifs_free_hash(&server->secmech.sha512); cifs_free_hash(&server->secmech.hmacmd5); - if (server->secmech.enc) { - crypto_free_aead(server->secmech.enc); - server->secmech.enc = NULL; - } + if (!SERVER_IS_CHAN(server)) { + if (server->secmech.enc) { + crypto_free_aead(server->secmech.enc); + server->secmech.enc = NULL; + } - if (server->secmech.dec) { - crypto_free_aead(server->secmech.dec); + if (server->secmech.dec) { + crypto_free_aead(server->secmech.dec); + server->secmech.dec = NULL; + } + } else { + server->secmech.enc = NULL; server->secmech.dec = NULL; } } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index cb423bfe7377a..02828b9c3cb32 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1268,9 +1268,13 @@ SMB2_negotiate(const unsigned int xid, } if (server->cipher_type && !rc) { - rc = smb3_crypto_aead_allocate(server); - if (rc) - cifs_server_dbg(VFS, "%s: crypto alloc failed, rc=%d\n", __func__, rc); + if (!SERVER_IS_CHAN(server)) { + rc = smb3_crypto_aead_allocate(server); + } else { + /* For channels, just reuse the primary server crypto secmech. */ + server->secmech.enc = server->primary_server->secmech.enc; + server->secmech.dec = server->primary_server->secmech.dec; + } } neg_exit: free_rsp_buf(resp_buftype, rsp); From a13ca780afab350f37f8be9eda2bf79d1aed9bdd Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 18 Sep 2024 02:04:01 -0300 Subject: [PATCH 299/352] smb: client: stop flooding dmesg in smb2_calc_signature() When having several mounts that share same credential and the client couldn't re-establish an SMB session due to an expired kerberos ticket or rotated password, smb2_calc_signature() will end up flooding dmesg when not finding SMB sessions to calculate signatures. Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index e4636fca821d9..c8bf0000f73bd 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -242,7 +242,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId)); if (unlikely(!ses)) { - cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); + cifs_server_dbg(FYI, "%s: Could not find session\n", __func__); return -ENOENT; } From db44ca9f7bc00a368d345b9fa1ecee0c4e75ac48 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Thu, 26 Sep 2024 14:46:15 -0300 Subject: [PATCH 300/352] smb: client: make HMAC-MD5 TFM ephemeral The HMAC-MD5 shash TFM is used only briefly during Session Setup stage, when computing NTLMv2 hashes. There's no need to keep it allocated in servers' secmech the whole time, so keep its lifetime inside setup_ntlmv2_rsp(). 
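Distilled, the new lifetime is scoped like this (a sketch of the pattern; the hunks below also tidy up the error messages):

	struct shash_desc *hmacmd5 = NULL;

	rc = cifs_alloc_hash("hmac(md5)", &hmacmd5);	/* allocated for this call only */
	if (!rc) {
		/* ... calc_ntlmv2_hash(), CalcNTLMv2_response(), session key ... */
	}
	cifs_free_hash(&hmacmd5);			/* nothing kept in server->secmech */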
Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/smb/client/cifsencrypt.c | 133 ++++++++++++++---------------------- fs/smb/client/cifsglob.h | 1 - 2 files changed, 50 insertions(+), 84 deletions(-) diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 15aa75e7f1c3d..464e6ccdfa5f8 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -416,7 +416,7 @@ find_timestamp(struct cifs_ses *ses) } static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, - const struct nls_table *nls_cp) + const struct nls_table *nls_cp, struct shash_desc *hmacmd5) { int rc = 0; int len; @@ -425,34 +425,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, wchar_t *domain; wchar_t *server; - if (!ses->server->secmech.hmacmd5) { - cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); - return -1; - } - /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash, nls_cp); - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, nt_hash, - CIFS_NTHASH_SIZE); + rc = crypto_shash_setkey(hmacmd5->tfm, nt_hash, CIFS_NTHASH_SIZE); if (rc) { - cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__); + cifs_dbg(VFS, "%s: Could not set NT hash as a key, rc=%d\n", __func__, rc); return rc; } - rc = crypto_shash_init(ses->server->secmech.hmacmd5); + rc = crypto_shash_init(hmacmd5); if (rc) { - cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); + cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc); return rc; } /* convert ses->user_name to unicode */ len = ses->user_name ? strlen(ses->user_name) : 0; user = kmalloc(2 + (len * 2), GFP_KERNEL); - if (user == NULL) { - rc = -ENOMEM; - return rc; - } + if (user == NULL) + return -ENOMEM; if (len) { len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp); @@ -461,11 +453,10 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, *(u16 *)user = 0; } - rc = crypto_shash_update(ses->server->secmech.hmacmd5, - (char *)user, 2 * len); + rc = crypto_shash_update(hmacmd5, (char *)user, 2 * len); kfree(user); if (rc) { - cifs_dbg(VFS, "%s: Could not update with user\n", __func__); + cifs_dbg(VFS, "%s: Could not update with user, rc=%d\n", __func__, rc); return rc; } @@ -474,19 +465,15 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = strlen(ses->domainName); domain = kmalloc(2 + (len * 2), GFP_KERNEL); - if (domain == NULL) { - rc = -ENOMEM; - return rc; - } + if (domain == NULL) + return -ENOMEM; + len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len, nls_cp); - rc = - crypto_shash_update(ses->server->secmech.hmacmd5, - (char *)domain, 2 * len); + rc = crypto_shash_update(hmacmd5, (char *)domain, 2 * len); kfree(domain); if (rc) { - cifs_dbg(VFS, "%s: Could not update with domain\n", - __func__); + cifs_dbg(VFS, "%s: Could not update with domain, rc=%d\n", __func__, rc); return rc; } } else { @@ -494,33 +481,27 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = strlen(ses->ip_addr); server = kmalloc(2 + (len * 2), GFP_KERNEL); - if (server == NULL) { - rc = -ENOMEM; - return rc; - } - len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, - nls_cp); - rc = - crypto_shash_update(ses->server->secmech.hmacmd5, - (char *)server, 2 * len); + if (server == NULL) + return -ENOMEM; + + len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp); + rc = crypto_shash_update(hmacmd5, (char *)server, 2 * len); kfree(server); if (rc) { - cifs_dbg(VFS, "%s: Could not 
update with server\n", - __func__); + cifs_dbg(VFS, "%s: Could not update with server, rc=%d\n", __func__, rc); return rc; } } - rc = crypto_shash_final(ses->server->secmech.hmacmd5, - ntlmv2_hash); + rc = crypto_shash_final(hmacmd5, ntlmv2_hash); if (rc) - cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); + cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc); return rc; } static int -CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) +CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_desc *hmacmd5) { int rc; struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *) @@ -531,43 +512,33 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE + offsetof(struct ntlmv2_resp, challenge.key[0])); - if (!ses->server->secmech.hmacmd5) { - cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); - return -1; - } - - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, - ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { - cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", - __func__); + cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc); return rc; } - rc = crypto_shash_init(ses->server->secmech.hmacmd5); + rc = crypto_shash_init(hmacmd5); if (rc) { - cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); + cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc); return rc; } if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) - memcpy(ntlmv2->challenge.key, - ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + memcpy(ntlmv2->challenge.key, ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); else - memcpy(ntlmv2->challenge.key, - ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); - rc = crypto_shash_update(ses->server->secmech.hmacmd5, - ntlmv2->challenge.key, hash_len); + memcpy(ntlmv2->challenge.key, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + + rc = crypto_shash_update(hmacmd5, ntlmv2->challenge.key, hash_len); if (rc) { - cifs_dbg(VFS, "%s: Could not update with response\n", __func__); + cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc); return rc; } /* Note that the MD5 digest over writes anon.challenge_key.key */ - rc = crypto_shash_final(ses->server->secmech.hmacmd5, - ntlmv2->ntlmv2_hash); + rc = crypto_shash_final(hmacmd5, ntlmv2->ntlmv2_hash); if (rc) - cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); + cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc); return rc; } @@ -575,6 +546,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) int setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) { + struct shash_desc *hmacmd5 = NULL; int rc; int baselen; unsigned int tilen; @@ -640,55 +612,51 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) cifs_server_lock(ses->server); - rc = cifs_alloc_hash("hmac(md5)", &ses->server->secmech.hmacmd5); + rc = cifs_alloc_hash("hmac(md5)", &hmacmd5); if (rc) { + cifs_dbg(VFS, "Could not allocate HMAC-MD5, rc=%d\n", rc); goto unlock; } /* calculate ntlmv2_hash */ - rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); + rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp, hmacmd5); if (rc) { - cifs_dbg(VFS, "Could not get v2 hash rc %d\n", rc); + cifs_dbg(VFS, "Could not get NTLMv2 hash, rc=%d\n", rc); goto unlock; } /* calculate first part of the client response 
(CR1) */ - rc = CalcNTLMv2_response(ses, ntlmv2_hash); + rc = CalcNTLMv2_response(ses, ntlmv2_hash, hmacmd5); if (rc) { - cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc); + cifs_dbg(VFS, "Could not calculate CR1, rc=%d\n", rc); goto unlock; } /* now calculate the session key for NTLMv2 */ - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, - ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { - cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", - __func__); + cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc); goto unlock; } - rc = crypto_shash_init(ses->server->secmech.hmacmd5); + rc = crypto_shash_init(hmacmd5); if (rc) { - cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); + cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc); goto unlock; } - rc = crypto_shash_update(ses->server->secmech.hmacmd5, - ntlmv2->ntlmv2_hash, - CIFS_HMAC_MD5_HASH_SIZE); + rc = crypto_shash_update(hmacmd5, ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { - cifs_dbg(VFS, "%s: Could not update with response\n", __func__); + cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc); goto unlock; } - rc = crypto_shash_final(ses->server->secmech.hmacmd5, - ses->auth_key.response); + rc = crypto_shash_final(hmacmd5, ses->auth_key.response); if (rc) - cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); - + cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc); unlock: cifs_server_unlock(ses->server); + cifs_free_hash(&hmacmd5); setup_ntlmv2_rsp_ret: kfree_sensitive(tiblob); @@ -733,7 +701,6 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server) cifs_free_hash(&server->secmech.hmacsha256); cifs_free_hash(&server->secmech.md5); cifs_free_hash(&server->secmech.sha512); - cifs_free_hash(&server->secmech.hmacmd5); if (!SERVER_IS_CHAN(server)) { if (server->secmech.enc) { diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 15571cf0ba632..da35c160e7dd4 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -178,7 +178,6 @@ struct session_key { /* crypto hashing related structure/fields, not specific to a sec mech */ struct cifs_secmech { - struct shash_desc *hmacmd5; /* hmacmd5 hash function, for NTLMv2/CR1 hashes */ struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */ struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */ struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */ From 220d83b52c7d16ec3c168b82f4e6ce59c645f7ab Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Thu, 26 Sep 2024 14:46:16 -0300 Subject: [PATCH 301/352] smb: client: make SHA-512 TFM ephemeral The SHA-512 shash TFM is used only briefly during Session Setup stage, when computing SMB 3.1.1 preauth hash. There's no need to keep it allocated in servers' secmech the whole time, so keep its lifetime inside smb311_update_preauth_hash(). This also makes smb311_crypto_shash_allocate() redundant, so expose smb3_crypto_shash_allocate() and use that. 
Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/smb/client/cifsencrypt.c | 1 - fs/smb/client/cifsglob.h | 1 - fs/smb/client/sess.c | 2 +- fs/smb/client/smb2misc.c | 28 ++++++++++++++-------------- fs/smb/client/smb2proto.h | 2 +- fs/smb/client/smb2transport.c | 30 +----------------------------- 6 files changed, 17 insertions(+), 47 deletions(-) diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 464e6ccdfa5f8..2d851f596a727 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -700,7 +700,6 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server) cifs_free_hash(&server->secmech.aes_cmac); cifs_free_hash(&server->secmech.hmacsha256); cifs_free_hash(&server->secmech.md5); - cifs_free_hash(&server->secmech.sha512); if (!SERVER_IS_CHAN(server)) { if (server->secmech.enc) { diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index da35c160e7dd4..315aac5dec051 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -180,7 +180,6 @@ struct session_key { struct cifs_secmech { struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */ struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */ - struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */ struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */ struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */ diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 3216f786908fb..03c0b484a4b5a 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -624,7 +624,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, * to sign packets before we generate the channel signing key * (we sign with the session key) */ - rc = smb311_crypto_shash_allocate(chan->server); + rc = smb3_crypto_shash_allocate(chan->server); if (rc) { cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); mutex_unlock(&ses->session_mutex); diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index f3c4b70b77b94..bdeb12ff53e3c 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -906,41 +906,41 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, || (hdr->Status != cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)))) return 0; - ok: - rc = smb311_crypto_shash_allocate(server); - if (rc) + rc = cifs_alloc_hash("sha512", &sha512); + if (rc) { + cifs_dbg(VFS, "%s: Could not allocate SHA512 shash, rc=%d\n", __func__, rc); return rc; + } - sha512 = server->secmech.sha512; rc = crypto_shash_init(sha512); if (rc) { - cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__); - return rc; + cifs_dbg(VFS, "%s: Could not init SHA512 shash, rc=%d\n", __func__, rc); + goto err_free; } rc = crypto_shash_update(sha512, ses->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE); if (rc) { - cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); - return rc; + cifs_dbg(VFS, "%s: Could not update SHA512 shash, rc=%d\n", __func__, rc); + goto err_free; } for (i = 0; i < nvec; i++) { rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len); if (rc) { - cifs_dbg(VFS, "%s: Could not update sha512 shash\n", - __func__); - return rc; + cifs_dbg(VFS, "%s: Could not update SHA512 shash, rc=%d\n", __func__, rc); + goto err_free; } } rc = crypto_shash_final(sha512, ses->preauth_sha_hash); if (rc) { - cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n", - __func__); - return rc; + cifs_dbg(VFS, "%s: Could not finalize 
SHA12 shash, rc=%d\n", __func__, rc); + goto err_free; } +err_free: + cifs_free_hash(&sha512); return 0; } diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index c7e1b149877a0..56a896ff7cd9f 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -291,7 +291,7 @@ extern int smb2_validate_and_copy_iov(unsigned int offset, extern void smb2_copy_fs_info_to_kstatfs( struct smb2_fs_full_size_info *pfs_inf, struct kstatfs *kst); -extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server); +extern int smb3_crypto_shash_allocate(struct TCP_Server_Info *server); extern int smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, struct kvec *iov, int nvec); diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index c8bf0000f73bd..f7e04c40d22e0 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -26,8 +26,7 @@ #include "../common/smb2status.h" #include "smb2glob.h" -static int -smb3_crypto_shash_allocate(struct TCP_Server_Info *server) +int smb3_crypto_shash_allocate(struct TCP_Server_Info *server) { struct cifs_secmech *p = &server->secmech; int rc; @@ -46,33 +45,6 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server) return rc; } -int -smb311_crypto_shash_allocate(struct TCP_Server_Info *server) -{ - struct cifs_secmech *p = &server->secmech; - int rc = 0; - - rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); - if (rc) - return rc; - - rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); - if (rc) - goto err; - - rc = cifs_alloc_hash("sha512", &p->sha512); - if (rc) - goto err; - - return 0; - -err: - cifs_free_hash(&p->aes_cmac); - cifs_free_hash(&p->hmacsha256); - return rc; -} - - static int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) { From 6d3405415f887aef5774c04ae9fefae63d82bdaf Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 24 Sep 2024 11:34:18 +0200 Subject: [PATCH 302/352] i2c: keba: I2C_KEBA should depend on KEBA_CP500 The KEBA I2C controller is only present on KEBA PLC devices. Hence add a dependency on KEBA_CP500, to prevent asking the user about this driver when configuring a kernel without KEBA CP500 system FPGA support. Fixes: c7e08c816cd2fdf8 ("i2c: keba: Add KEBA I2C controller support") Signed-off-by: Geert Uytterhoeven Reviewed-by: Gerhard Engleder Signed-off-by: Andi Shyti --- drivers/i2c/busses/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 53f18b351f538..6b3ba7e5723aa 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -782,6 +782,7 @@ config I2C_JZ4780 config I2C_KEBA tristate "KEBA I2C controller support" depends on HAS_IOMEM + depends on KEBA_CP500 || COMPILE_TEST select AUXILIARY_BUS help This driver supports the I2C controller found in KEBA system FPGA From 0c8d604dea437b69a861479b413d629bc9b3da70 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 23 Sep 2024 11:42:50 +0800 Subject: [PATCH 303/352] i2c: xiic: Fix pm_runtime_set_suspended() with runtime pm enabled It is not valid to call pm_runtime_set_suspended() for devices with runtime PM enabled because it returns -EAGAIN if it is enabled already and working. So, call pm_runtime_disable() before to fix it. 
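With the swap, the probe error path reads (as in the hunk below, comments added for illustration):

err_pm_disable:
	pm_runtime_disable(&pdev->dev);		/* stop runtime PM first... */
	pm_runtime_set_suspended(&pdev->dev);	/* ...so the status change is accepted */
	return ret;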
Fixes: 36ecbcab84d0 ("i2c: xiic: Implement power management") Cc: # v4.6+ Signed-off-by: Jinjie Ruan Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-xiic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-xiic.c b/drivers/i2c/busses/i2c-xiic.c index 4c89aad02451d..1d68177241a6b 100644 --- a/drivers/i2c/busses/i2c-xiic.c +++ b/drivers/i2c/busses/i2c-xiic.c @@ -1337,8 +1337,8 @@ static int xiic_i2c_probe(struct platform_device *pdev) return 0; err_pm_disable: - pm_runtime_set_suspended(&pdev->dev); pm_runtime_disable(&pdev->dev); + pm_runtime_set_suspended(&pdev->dev); return ret; } From 54595f2807d203770ee50486cb23dc5763916d72 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Sep 2024 20:38:09 +0000 Subject: [PATCH 304/352] mailbox, remoteproc: omap2+: fix compile testing Selecting CONFIG_OMAP2PLUS_MBOX while compile testing causes a build failure: WARNING: unmet direct dependencies detected for OMAP2PLUS_MBOX Depends on [n]: MAILBOX [=y] && (ARCH_OMAP2PLUS || ARCH_K3) Selected by [m]: - TI_K3_M4_REMOTEPROC [=m] && REMOTEPROC [=y] && (ARCH_K3 || COMPILE_TEST [=y]) Using 'select' to force-enable another subsystem is generally a mistake and causes problems such as this one, so change the three drivers that link against this driver to use 'depends on' instead, and ensure the driver itself can be compile tested regardless of the platform. When compile-testing without CONFIG_TI_SCI_PROTOCOL=m, there is a chance for a link failure, so add a careful dependency on that. arm-linux-gnueabi-ld: drivers/remoteproc/ti_k3_m4_remoteproc.o: in function `k3_m4_rproc_probe': ti_k3_m4_remoteproc.c:(.text.k3_m4_rproc_probe+0x76): undefined reference to `devm_ti_sci_get_by_phandle' Fixes: ebcf9008a895 ("remoteproc: k3-m4: Add a remoteproc driver for M4F subsystem") Signed-off-by: Arnd Bergmann Reviewed-by: Mathieu Poirier Reviewed-by: Andrew Davis Reviewed-by: Martyn Welch Signed-off-by: Jassi Brar --- drivers/mailbox/Kconfig | 2 +- drivers/mailbox/omap-mailbox.c | 2 +- drivers/remoteproc/Kconfig | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig index cbd9206cd7de3..6fb995778636a 100644 --- a/drivers/mailbox/Kconfig +++ b/drivers/mailbox/Kconfig @@ -74,7 +74,7 @@ config ARMADA_37XX_RWTM_MBOX config OMAP2PLUS_MBOX tristate "OMAP2+ Mailbox framework support" - depends on ARCH_OMAP2PLUS || ARCH_K3 + depends on ARCH_OMAP2PLUS || ARCH_K3 || COMPILE_TEST help Mailbox implementation for OMAP family chips with hardware for interprocessor communication involving DSP, IVA1.0 and IVA2 in diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c index 7a87424657a15..6797770474a55 100644 --- a/drivers/mailbox/omap-mailbox.c +++ b/drivers/mailbox/omap-mailbox.c @@ -603,7 +603,7 @@ static struct platform_driver omap_mbox_driver = { .driver = { .name = "omap-mailbox", .pm = &omap_mbox_pm_ops, - .of_match_table = of_match_ptr(omap_mailbox_of_match), + .of_match_table = omap_mailbox_of_match, }, }; module_platform_driver(omap_mbox_driver); diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index dda2ada215b7c..7ab80fe511a76 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -330,8 +330,7 @@ config STM32_RPROC config TI_K3_DSP_REMOTEPROC tristate "TI K3 DSP remoteproc support" depends on ARCH_K3 - select MAILBOX - select OMAP2PLUS_MBOX + depends on OMAP2PLUS_MBOX help Say m here to support TI's C66x and C71x DSP remote processor subsystems on 
various TI K3 family of SoCs through the remote @@ -343,8 +342,7 @@ config TI_K3_DSP_REMOTEPROC config TI_K3_R5_REMOTEPROC tristate "TI K3 R5 remoteproc support" depends on ARCH_K3 - select MAILBOX - select OMAP2PLUS_MBOX + depends on OMAP2PLUS_MBOX help Say m here to support TI's R5F remote processor subsystems on various TI K3 family of SoCs through the remote processor From efbc6bd090f48ccf64f7a8dd5daea775821d57ec Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Sep 2024 11:45:45 -0400 Subject: [PATCH 305/352] Documentation: KVM: fix warning in "make htmldocs" The warning Documentation/virt/kvm/locking.rst:31: ERROR: Unexpected indentation. is caused by incorrectly treating a line as the continuation of a paragraph, rather than as the first line in a bullet list. Fixed: 44d174596260 ("KVM: Use dedicated mutex to protect kvm_usage_count to avoid deadlock") Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/locking.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index be3c323888b19..20a9a37d1cdd2 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -27,6 +27,7 @@ The acquisition orders for mutexes are as follows: must not take either kvm->slots_lock or kvm->slots_arch_lock. cpus_read_lock() vs kvm_lock: + - Taking cpus_read_lock() outside of kvm_lock is problematic, despite that being the official ordering, as it is quite easy to unknowingly trigger cpus_read_lock() while holding kvm_lock. Use caution when walking vm_list, From 4b721fcc094e9eb6dd4702df8d79ab11e120833d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 24 Sep 2024 13:47:23 +0200 Subject: [PATCH 306/352] selftests: vDSO: align stack for O2-optimized memcpy When switching on -O2, gcc generates SSE2 instructions that assume a 16-byte aligned stack, which the standalone test's start point wasn't aligning. Fix this with the usual alignment sequence. Fixes: ecb8bd70d51 ("selftests: vDSO: build tests with O2 optimization") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202409241558.98e13f6f-oliver.sang@intel.com Signed-off-by: Jason A. Donenfeld Signed-off-by: Shuah Khan --- tools/testing/selftests/vDSO/vdso_standalone_test_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c index 27f6fdf119691..644915862af88 100644 --- a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c +++ b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c @@ -131,6 +131,8 @@ asm ( "_start:\n\t" #ifdef __x86_64__ "mov %rsp,%rdi\n\t" + "and $-16,%rsp\n\t" + "sub $8,%rsp\n\t" "jmp c_main" #else "push %esp\n\t" From 9cf14f5a2746c19455ce9cb44341b5527b5e19c3 Mon Sep 17 00:00:00 2001 From: Andrey Shumilin Date: Fri, 27 Sep 2024 22:34:24 +0300 Subject: [PATCH 307/352] fbdev: sisfb: Fix strbuf array overflow The values of the variables xres and yres are placed in strbuf. These variables are obtained from strbuf1. The strbuf1 array contains digit characters and a space if the array contains non-digit characters. Then, when executing sprintf(strbuf, "%ux%ux8", xres, yres); more than 16 bytes will be written to strbuf. It is suggested to increase the size of the strbuf array to 24. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
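For illustration (assuming a 32-bit unsigned int), the worst case that sprintf(strbuf, "%ux%ux8", xres, yres) can produce is:

  "4294967295x4294967295x8"   /* 10 + 1 + 10 + 2 = 23 chars + NUL = 24 bytes */

which overflows a 16-byte buffer but always fits in 24 bytes.
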
Signed-off-by: Andrey Shumilin Signed-off-by: Helge Deller --- drivers/video/fbdev/sis/sis_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/sis/sis_main.c b/drivers/video/fbdev/sis/sis_main.c index 009bf1d926448..75033e6be15ab 100644 --- a/drivers/video/fbdev/sis/sis_main.c +++ b/drivers/video/fbdev/sis/sis_main.c @@ -183,7 +183,7 @@ static void sisfb_search_mode(char *name, bool quiet) { unsigned int j = 0, xres = 0, yres = 0, depth = 0, rate = 0; int i = 0; - char strbuf[16], strbuf1[20]; + char strbuf[24], strbuf1[20]; char *nameptr = name; /* We don't know the hardware specs yet and there is no ivideo */ From f890c8513f45e06c96ac225db3cdfa34e3be5f45 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 16:40:47 -0400 Subject: [PATCH 308/352] bcachefs: Mark inode errors as autofix Most or all errors will be autofix in the future, we're currently just doing the ones that we know are well tested. Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index f0c14702f9e62..4e4e132d6d232 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -213,19 +213,19 @@ enum bch_fsck_flags { x(inode_checksum_type_invalid, 199, 0) \ x(inode_compression_type_invalid, 200, 0) \ x(inode_subvol_root_but_not_dir, 201, 0) \ - x(inode_i_size_dirty_but_clean, 202, 0) \ - x(inode_i_sectors_dirty_but_clean, 203, 0) \ - x(inode_i_sectors_wrong, 204, 0) \ - x(inode_dir_wrong_nlink, 205, 0) \ - x(inode_dir_multiple_links, 206, 0) \ - x(inode_multiple_links_but_nlink_0, 207, 0) \ - x(inode_wrong_backpointer, 208, 0) \ - x(inode_wrong_nlink, 209, 0) \ - x(inode_unreachable, 210, 0) \ - x(deleted_inode_but_clean, 211, 0) \ - x(deleted_inode_missing, 212, 0) \ - x(deleted_inode_is_dir, 213, 0) \ - x(deleted_inode_not_unlinked, 214, 0) \ + x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \ + x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \ + x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \ + x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \ + x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \ + x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ + x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ + x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ + x(inode_unreachable, 210, FSCK_AUTOFIX) \ + x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ + x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ + x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ + x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ x(extent_overlapping, 215, 0) \ x(key_in_missing_inode, 216, 0) \ x(key_in_wrong_inode_type, 217, 0) \ @@ -255,7 +255,7 @@ enum bch_fsck_flags { x(dir_loop, 241, 0) \ x(hash_table_key_duplicate, 242, 0) \ x(hash_table_key_wrong_offset, 243, 0) \ - x(unlinked_inode_not_on_deleted_list, 244, 0) \ + x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \ x(reflink_p_front_pad_bad, 245, 0) \ x(journal_entry_dup_same_device, 246, 0) \ x(inode_bi_subvol_missing, 247, 0) \ @@ -282,8 +282,8 @@ enum bch_fsck_flags { x(btree_ptr_v2_written_0, 268, 0) \ x(subvol_snapshot_bad, 269, 0) \ x(subvol_inode_bad, 270, 0) \ - x(alloc_key_stripe_sectors_wrong, 271, 0) \ - x(accounting_mismatch, 272, 0) \ + x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ + x(accounting_mismatch, 272, FSCK_AUTOFIX) \ x(accounting_replicas_not_marked, 273, 0) \ x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 
275, 0) \ From 4a8f8fafbd6ba6f3433c986b00195e0a8dee96bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 17:30:59 -0400 Subject: [PATCH 309/352] bcachefs: Add extra padding in bkey_make_mut_noupdate() This fixes a kasan splat in propagate_key_to_snapshot_leaves() - varint_decode_fast() does reads (that it never uses) up to 7 bytes past the end of the integer. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 60393e98084d7..6a454f2fa0053 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -220,7 +220,8 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t if (type && k.k->type != type) return ERR_PTR(-ENOENT); - mut = bch2_trans_kmalloc_nomemzero(trans, bytes); + /* extra padding for varint_decode_fast... */ + mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8); if (!IS_ERR(mut)) { bkey_reassemble(mut, k); From 51b7cc7c0f964fed976399a3ab876ae4a308fb1b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 17:33:02 -0400 Subject: [PATCH 310/352] bcachefs: Improve bch2_is_inode_open() warning message Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9b3470a975461..89129548aebb6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -963,7 +963,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i return ret; } -static bool bch2_inode_open(struct bch_fs *c, struct bpos p) +static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) { subvol_inum inum = { .subvol = snapshot_t(c, p.snapshot)->subvol, @@ -972,7 +972,7 @@ static bool bch2_inode_open(struct bch_fs *c, struct bpos p) /* snapshot tree corruption, can't safely delete */ if (!inum.subvol) { - bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot); + bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot); return true; } @@ -1057,7 +1057,7 @@ static int check_inode(struct btree_trans *trans, } if (u.bi_flags & BCH_INODE_unlinked && - !bch2_inode_open(c, k.k->p) && + !bch2_inode_is_open(c, k.k->p) && (!c->sb.clean || fsck_err(trans, inode_unlinked_but_clean, "filesystem marked clean, but inode %llu unlinked", From 0696a18a8cc3f0941efe64008a997dc4701f9587 Mon Sep 17 00:00:00 2001 From: Piotr Zalewski Date: Sun, 22 Sep 2024 15:18:01 +0000 Subject: [PATCH 311/352] bcachefs: memset bounce buffer portion to 0 after key_sort_fix_overlapping Zero-initialize part of allocated bounce buffer which wasn't touched by subsequent bch2_key_sort_fix_overlapping to mitigate later uinit-value use KMSAN bug[1]. After applying the patch reproducer still triggers stack overflow[2] but it seems unrelated to the uninit-value use warning. After further investigation it was found that stack overflow occurs because KMSAN adds too many function calls[3]. Backtrace of where the stack magic number gets smashed was added as a reply to syzkaller thread[3]. It was confirmed that task's stack magic number gets smashed after the code path where KSMAN detects uninit-value use is executed, so it can be assumed that it doesn't contribute in any way to uninit-value use detection. 
[1] https://syzkaller.appspot.com/bug?extid=6f655a60d3244d0c6718 [2] https://lore.kernel.org/lkml/66e57e46.050a0220.115905.0002.GAE@google.com [3] https://lore.kernel.org/all/rVaWgPULej8K7HqMPNIu8kVNyXNjjCiTB-QBtItLFBmk0alH6fV2tk4joVPk97Evnuv4ZRDd8HB5uDCkiFG6u81xKdzDj-KrtIMJSlF6Kt8=@proton.me Reported-by: syzbot+6f655a60d3244d0c6718@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6f655a60d3244d0c6718 Fixes: ec4edd7b9d20 ("bcachefs: Prep work for variable size btree node buffers") Suggested-by: Kent Overstreet Signed-off-by: Piotr Zalewski Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index cb48a9477514c..2a0420605bd7f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1195,6 +1195,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset(b, b->set, &b->data->keys); b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); + memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, + btree_buf_bytes(b) - + sizeof(struct btree_node) - + b->nr.live_u64s * sizeof(u64)); u64s = le16_to_cpu(sorted->keys.u64s); *sorted = *b->data; From 18c520f408fa8f4b7379a108b1676052e82677aa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 18:41:46 -0400 Subject: [PATCH 312/352] bcachefs: Fix error path in check_dirent_inode_dirent() fsck_err() jumps to the fsck_err label when bailing out; need to make sure bp_iter was initialized... Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 89129548aebb6..3508f8b5f06e3 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1758,6 +1758,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = { NULL }; int ret = 0; if (inode_points_to_dirent(target, d)) @@ -1770,7 +1771,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, prt_printf(&buf, "\n "), bch2_inode_unpacked_to_text(&buf, target), buf.buf))) - goto out_noiter; + goto err; if (!target->bi_dir && !target->bi_dir_offset) { @@ -1779,7 +1780,6 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, return __bch2_fsck_write_inode(trans, target, target_snapshot); } - struct btree_iter bp_iter = { NULL }; struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); ret = bkey_err(bp_dirent); @@ -1840,7 +1840,6 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); -out_noiter: printbuf_exit(&buf); bch_err_fn(c, ret); return ret; From c6040447c56496f4929db2d73ee445d898dd8a98 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 18:42:39 -0400 Subject: [PATCH 313/352] bcachefs: Fix srcu warning in check_topology check_topology doesn't need the srcu lock and doesn't use normal btree transactions - we can just drop the srcu lock. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b5e0692f03c68..40b08fef3c398 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -513,6 +513,8 @@ int bch2_check_topology(struct bch_fs *c) struct bpos pulled_from_scan = POS_MIN; int ret = 0; + bch2_trans_srcu_unlock(trans); + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { struct btree_root *r = bch2_btree_id_root(c, i); bool reconstructed_root = false; From 40d40c6bea19ff1e40fb3d33b35b354a5b35025f Mon Sep 17 00:00:00 2001 From: Diogo Jahchan Koike Date: Mon, 23 Sep 2024 19:22:14 -0300 Subject: [PATCH 314/352] bcachefs: assign return error when iterating through layout syzbot reported a null ptr deref in __copy_user [0] In __bch2_read_super, when a corrupt backup superblock matches the default opts offset, no error is assigned to ret and the freed superblock gets through, possibly being assigned as the best sb in bch2_fs_open and being later dereferenced, causing a fault. Assign EINVALID to ret when iterating through layout. [0]: https://syzkaller.appspot.com/bug?extid=18a5c5e8a9c856944876 Reported-by: syzbot+18a5c5e8a9c856944876@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=18a5c5e8a9c856944876 Signed-off-by: Diogo Jahchan Koike Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index d86d5dae54c9d..b2e914841b5f9 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -799,8 +799,10 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, i < layout.sb_offset + layout.nr_superblocks; i++) { offset = le64_to_cpu(*i); - if (offset == opt_get(*opts, sb)) + if (offset == opt_get(*opts, sb)) { + ret = -BCH_ERR_invalid; continue; + } ret = read_one_super(sb, offset, &err); if (!ret) From 2a1df873463a28fe5a053d6245290f9a907a5a17 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 22:06:04 -0400 Subject: [PATCH 315/352] bcachefs: Add snapshot to bch_inode_unpacked this allows for various cleanups in fsck Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 10 ++++++---- fs/bcachefs/inode.h | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 6ac0ff7e074ba..1116db239708f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -320,9 +320,11 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { - if (likely(k.k->type == KEY_TYPE_inode_v3)) - return bch2_inode_unpack_v3(k, unpacked); - return bch2_inode_unpack_slowpath(k, unpacked); + unpacked->bi_snapshot = k.k->p.snapshot; + + return likely(k.k->type == KEY_TYPE_inode_v3) + ? 
bch2_inode_unpack_v3(k, unpacked) + : bch2_inode_unpack_slowpath(k, unpacked); } int bch2_inode_peek_nowarn(struct btree_trans *trans, @@ -557,7 +559,7 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - prt_printf(out, "inum: %llu ", inode->bi_inum); + prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot); __bch2_inode_unpacked_to_text(out, inode); } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index f1fcb4c58039a..695abd707cb6a 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -69,6 +69,7 @@ typedef u64 u96; struct bch_inode_unpacked { u64 bi_inum; + u32 bi_snapshot; u64 bi_journal_seq; __le64 bi_hash_seed; u64 bi_size; From 951dd86e7c59a6490d8b6d056d8afd5f0cd49293 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 22:05:14 -0400 Subject: [PATCH 316/352] bcachefs: Fix iterator leak in check_subvol() A couple small error handling fixes Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 54 ++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index dbe834cb349f4..6845dde1b3397 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -92,34 +92,32 @@ static int check_subvol(struct btree_trans *trans, } struct bch_inode_unpacked inode; - struct btree_iter inode_iter = {}; - ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, + ret = bch2_inode_find_by_inum_nowarn_trans(trans, (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, - 0); - bch2_trans_iter_exit(trans, &inode_iter); - - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (fsck_err_on(ret, - trans, subvol_to_missing_root, - "subvolume %llu points to missing subvolume root %llu:%u", - k.k->p.offset, le64_to_cpu(subvol.v->inode), - le32_to_cpu(subvol.v->snapshot))) { - ret = bch2_subvolume_delete(trans, iter->pos.offset); - bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); - return ret ?: -BCH_ERR_transaction_restart_nested; - } - - if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, - trans, subvol_root_wrong_bi_subvol, - "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", - inode.bi_inum, inode_iter.k.p.snapshot, - inode.bi_subvol, subvol.k->p.offset)) { - inode.bi_subvol = subvol.k->p.offset; - ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot)); - if (ret) + &inode); + if (!ret) { + if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, + trans, subvol_root_wrong_bi_subvol, + "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", + inode.bi_inum, inode.bi_snapshot, + inode.bi_subvol, subvol.k->p.offset)) { + inode.bi_subvol = subvol.k->p.offset; + ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot)); + if (ret) + goto err; + } + } else if (bch2_err_matches(ret, ENOENT)) { + if (fsck_err(trans, subvol_to_missing_root, + "subvolume %llu points to missing subvolume root %llu:%u", + k.k->p.offset, le64_to_cpu(subvol.v->inode), + le32_to_cpu(subvol.v->snapshot))) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + ret = ret ?: -BCH_ERR_transaction_restart_nested; goto err; + } + } else { + goto err; } if (!BCH_SUBVOLUME_SNAP(subvol.v)) { @@ -137,7 +135,7 @@ static int check_subvol(struct btree_trans *trans, "%s: snapshot tree %u not found", 
__func__, snapshot_tree); if (ret) - return ret; + goto err; if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, trans, subvol_not_master_and_not_snapshot, @@ -147,7 +145,7 @@ static int check_subvol(struct btree_trans *trans, bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); ret = PTR_ERR_OR_ZERO(s); if (ret) - return ret; + goto err; SET_BCH_SUBVOLUME_SNAP(&s->v, true); } From 3125c95ea69141c4cbbde854674c90531034ec47 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 24 Sep 2024 09:42:24 +0800 Subject: [PATCH 317/352] bcachefs: fast exit when darray_make_room failed In downgrade_table_extra, the return value is needed. When it return failed, we should exit immediately. Fixes: 7773df19c35f ("bcachefs: metadata version bucket_stripe_sectors") Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-downgrade.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index c7e4cdd3f6a52..6f0493f79959f 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -353,7 +353,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c) for (unsigned i = 0; i < src->nr_errors; i++) dst->errors[i] = cpu_to_le16(src->errors[i]); - downgrade_table_extra(c, &table); + ret = downgrade_table_extra(c, &table); + if (ret) + goto out; if (!dst->recovery_passes[0] && !dst->recovery_passes[1] && From dc5bfdf8eaed76cf527c9477952c535f75e0e499 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 24 Sep 2024 09:41:46 +0800 Subject: [PATCH 318/352] bcachefs: fix the memory leak in exception case The pointer clean points the memory allocated by kmemdup, when the return value of bch2_sb_clean_validate_late is not zero. The memory pointed by clean is leaked. So we should free it in this case. Fixes: a37ad1a3aba9 ("bcachefs: sb-clean.c") Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-clean.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 025848a9c4c03..0052752818045 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -167,6 +167,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) ret = bch2_sb_clean_validate_late(c, clean, READ); if (ret) { + kfree(clean); mutex_unlock(&c->sb_lock); return ERR_PTR(ret); } From b29c30ab48e0395a22ecf0b94443d16a8f493fb6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Sep 2024 19:31:22 -0400 Subject: [PATCH 319/352] bcachefs: Fix incorrect IS_ERR_OR_NULL usage Returning a positive integer instead of an error code causes error paths to become very confused. 
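A rough illustration of the difference, based on the usual include/linux/err.h helpers (paraphrased, not verbatim):

  t = kthread_run(read_btree_nodes_worker, w, ...);  /* ERR_PTR(-errno) on failure */

  ret = IS_ERR_OR_NULL(t);    /* bool: ret == 1 on failure, not a negative errno */
  ret = PTR_ERR_OR_ZERO(t);   /* ret == -errno on failure, 0 on success */
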
Closes: syzbot+c0360e8367d6d8d04a66@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_node_scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index b28c649c68389..1e694fedc5da0 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -275,7 +275,7 @@ static int read_btree_nodes(struct find_btree_nodes *f) w->ca = ca; t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); - ret = IS_ERR_OR_NULL(t); + ret = PTR_ERR_OR_ZERO(t); if (ret) { percpu_ref_put(&ca->io_ref); closure_put(&cl); From 22a507d68eb8dc9d05b7e91e9a7c9b2566d48c81 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 22:29:05 -0400 Subject: [PATCH 320/352] bcachefs: kill inode_walker_entry.seen_this_pos dead code Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 3508f8b5f06e3..0b5a009a0c2ef 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -626,7 +626,6 @@ static int ref_visible2(struct bch_fs *c, struct inode_walker_entry { struct bch_inode_unpacked inode; u32 snapshot; - bool seen_this_pos; u64 count; }; @@ -740,9 +739,6 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); if (ret) return ERR_PTR(ret); - } else if (bkey_cmp(w->last_pos, k.k->p)) { - darray_for_each(w->inodes, i) - i->seen_this_pos = false; } w->last_pos = k.k->p; @@ -1634,8 +1630,6 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (bkey_extent_is_allocation(k.k)) i->count += k.k->size; } - - i->seen_this_pos = true; } if (k.k->type != KEY_TYPE_whiteout) { From 3672bda8f5edd8ed23c745965500ceb53286d48d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 22:32:58 -0400 Subject: [PATCH 321/352] bcachefs: fix transaction restart handling in check_extents(), check_dirents() Dealing with outside state within a btree transaction is always tricky. check_extents() and check_dirents() have to accumulate counters for i_sectors and i_nlink (for subdirectories). 
There were two bugs: - transaction commit may return a restart; therefore we have to commit before accumulating to those counters - get_inode_all_snapshots() may return a transaction restart, before updating w->last_pos; then, on the restart, check_i_sectors()/check_subdir_count() would see inodes that were not for w->last_pos Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 94 +++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0b5a009a0c2ef..8fe67abae36e3 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -631,6 +631,7 @@ struct inode_walker_entry { struct inode_walker { bool first_this_inode; + bool have_inodes; bool recalculate_sums; struct bpos last_pos; @@ -668,6 +669,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, struct bkey_s_c k; int ret; + /* + * We no longer have inodes for w->last_pos; clear this to avoid + * screwing up check_i_sectors/check_subdir_count if we take a + * transaction restart here: + */ + w->have_inodes = false; w->recalculate_sums = false; w->inodes.nr = 0; @@ -685,6 +692,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, return ret; w->first_this_inode = true; + w->have_inodes = true; return 0; } @@ -1551,10 +1559,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct inode_walker *inode, struct snapshots_seen *s, - struct extent_ends *extent_ends) + struct extent_ends *extent_ends, + struct disk_reservation *res) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; int ret = 0; @@ -1564,7 +1572,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - if (inode->last_pos.inode != k.k->p.inode) { + if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) { ret = check_i_sectors(trans, inode); if (ret) goto err; @@ -1574,12 +1582,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - i = walk_inode(trans, inode, k); - ret = PTR_ERR_OR_ZERO(i); + struct inode_walker_entry *extent_i = walk_inode(trans, inode, k); + ret = PTR_ERR_OR_ZERO(extent_i); if (ret) goto err; - ret = check_key_has_inode(trans, iter, inode, i, k); + ret = check_key_has_inode(trans, iter, inode, extent_i, k); if (ret) goto err; @@ -1588,24 +1596,19 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, &inode->recalculate_sums); if (ret) goto err; - } - - /* - * Check inodes in reverse order, from oldest snapshots to newest, - * starting from the inode that matches this extent's snapshot. If we - * didn't have one, iterate over all inodes: - */ - if (!i) - i = &darray_last(inode->inodes); - for (; - inode->inodes.data && i >= inode->inodes.data; - --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) - continue; + /* + * Check inodes in reverse order, from oldest snapshots to + * newest, starting from the inode that matches this extent's + * snapshot. 
If we didn't have one, iterate over all inodes: + */ + for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); + inode->inodes.data && i >= inode->inodes.data; + --i) { + if (i->snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + continue; - if (k.k->type != KEY_TYPE_whiteout) { if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), @@ -1625,10 +1628,24 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; iter->k.type = KEY_TYPE_whiteout; + break; } + } + } + + ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + if (bkey_extent_is_allocation(k.k)) { + for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); + inode->inodes.data && i >= inode->inodes.data; + --i) { + if (i->snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + continue; - if (bkey_extent_is_allocation(k.k)) - i->count += k.k->size; + i->count += k.k->size; } } @@ -1660,13 +1677,11 @@ int bch2_check_extents(struct bch_fs *c) extent_ends_init(&extent_ends); int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_extents, + for_each_btree_key(trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, - BCH_TRANS_COMMIT_no_enospc, ({ + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ bch2_disk_reservation_put(c, &res); - check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: + check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: check_extent_overbig(trans, &iter, k); })) ?: check_i_sectors_notnested(trans, &w)); @@ -2068,7 +2083,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type == KEY_TYPE_whiteout) goto out; - if (dir->last_pos.inode != k.k->p.inode) { + if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { ret = check_subdir_count(trans, dir); if (ret) goto err; @@ -2130,11 +2145,15 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } - - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) - i->count++; } + + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + i->count++; out: err: fsck_err: @@ -2157,12 +2176,9 @@ int bch2_check_dirents(struct bch_fs *c) snapshots_seen_init(&s); int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: check_subdir_count_notnested(trans, &dir)); From 1e0272ef4774eed8314e8d10a8856c979deeaf35 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Sep 2024 22:53:56 -0400 Subject: [PATCH 322/352] bcachefs: bch_accounting_mode Minor refactoring - replace multiple bool arguments with an enum; prep work for fixing a bug in accounting read. 
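For illustration, the call-site difference this enables (adapted from the diff below):

  /* before: two anonymous bools, easy to transpose or misread */
  bch2_accounting_mem_mod_locked(trans, a, false, true);

  /* after: the mode is self-describing */
  bch2_accounting_mem_mod_locked(trans, a, BCH_ACCOUNTING_read);
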
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 4 ++-- fs/bcachefs/disk_accounting.c | 10 +++++++--- fs/bcachefs/disk_accounting.h | 25 +++++++++++++++++-------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 91884da4e30a3..4a131e3b62251 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -712,7 +712,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, a->k.version = journal_pos_to_bversion(&trans->journal_res, (u64 *) entry - (u64 *) trans->journal_entries); BUG_ON(bversion_zero(a->k.version)); - ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false, false); + ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal); if (ret) goto revert_fs_usage; } @@ -798,7 +798,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, false, false); + bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); bch2_accounting_neg(a); } percpu_up_read(&c->mark_lock); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index e972e2bca546a..e9a7f1dab4505 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -319,7 +319,8 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun return -BCH_ERR_ENOMEM_disk_accounting; } -int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) +int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, + enum bch_accounting_mode mode) { struct bch_replicas_padded r; @@ -566,7 +567,9 @@ int bch2_gc_accounting_done(struct bch_fs *c) struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; accounting_key_init(&k_i.k, &acc_k, src_v, nr); - bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false, false); + bch2_accounting_mem_mod_locked(trans, + bkey_i_to_s_c_accounting(&k_i.k), + BCH_ACCOUNTING_normal); preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); @@ -595,7 +598,8 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) return 0; percpu_down_read(&c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), false, true); + int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), + BCH_ACCOUNTING_read); percpu_up_read(&c->mark_lock); if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) && diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index f29fd0dd9581f..243bfb0975eab 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -103,23 +103,35 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r) return bpos_cmp(*l, *r); } -int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool); +enum bch_accounting_mode { + BCH_ACCOUNTING_normal, + BCH_ACCOUNTING_gc, + BCH_ACCOUNTING_read, +}; + +int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); /* * Update in memory counters so they match the btree update we're doing; called * from transaction commit path */ -static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct 
bkey_s_c_accounting a, bool gc, bool read) +static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, + struct bkey_s_c_accounting a, + enum bch_accounting_mode mode) { struct bch_fs *c = trans->c; + struct bch_accounting_mem *acc = &c->accounting; struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, a.k->p); + bool gc = mode == BCH_ACCOUNTING_gc; + + EBUG_ON(gc && !acc->gc_running); if (acc_k.type == BCH_DISK_ACCOUNTING_inum) return 0; - if (!gc && !read) { + if (mode == BCH_ACCOUNTING_normal) { switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; @@ -140,14 +152,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru } } - struct bch_accounting_mem *acc = &c->accounting; unsigned idx; - EBUG_ON(gc && !acc->gc_running); - while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { - int ret = bch2_accounting_mem_insert(c, a, gc); + int ret = bch2_accounting_mem_insert(c, a, mode); if (ret) return ret; } @@ -164,7 +173,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) { percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, gc, false); + int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal); percpu_up_read(&trans->c->mark_lock); return ret; } From 9104fc1928704ee5708ec7f43ab8dfbdc66e2ce8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Sep 2024 16:46:06 -0400 Subject: [PATCH 323/352] bcachefs: Fix accounting read + device removal accounting read was checking if accounting replicas entries were marked in the superblock prior to applying accounting from the journal, which meant that a recently removed device could spuriously trigger a "not marked in superblocked" error (when journal entries zero out the offending counter). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 47 +++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index e9a7f1dab4505..6e5e8b6f71822 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -324,7 +324,8 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, { struct bch_replicas_padded r; - if (accounting_to_replicas(&r.e, a.k->p) && + if (mode != BCH_ACCOUNTING_read && + accounting_to_replicas(&r.e, a.k->p) && !bch2_replicas_marked_locked(c, &r.e)) return -BCH_ERR_btree_insert_need_mark_replicas; @@ -592,7 +593,6 @@ int bch2_gc_accounting_done(struct bch_fs *c) static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; if (k.k->type != KEY_TYPE_accounting) return 0; @@ -601,22 +601,6 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), BCH_ACCOUNTING_read); percpu_up_read(&c->mark_lock); - - if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) && - ret == -BCH_ERR_btree_insert_need_mark_replicas) - ret = 0; - - struct disk_accounting_pos acc; - bpos_to_disk_accounting_pos(&acc, k.k->p); - - if (fsck_err_on(ret == -BCH_ERR_btree_insert_need_mark_replicas, - trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n %s", - (bch2_accounting_key_to_text(&buf, &acc), - buf.buf))) - ret = bch2_accounting_update_sb_one(c, k.k->p); -fsck_err: - printbuf_exit(&buf); return ret; } @@ -628,6 +612,7 @@ int bch2_accounting_read(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; int ret = for_each_btree_key(trans, iter, BTREE_ID_accounting, POS_MIN, @@ -678,6 +663,30 @@ int bch2_accounting_read(struct bch_fs *c) keys->gap = keys->nr = dst - keys->data; percpu_down_read(&c->mark_lock); + for (unsigned i = 0; i < acc->k.nr; i++) { + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); + + if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters)) + continue; + + struct bch_replicas_padded r; + + if (!accounting_to_replicas(&r.e, acc->k.data[i].pos)) + continue; + + struct disk_accounting_pos k; + bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); + + if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), + trans, accounting_replicas_not_marked, + "accounting not marked in superblock replicas\n %s", + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &k), + buf.buf))) + ret = bch2_accounting_update_sb_one(c, acc->k.data[i].pos); + } + preempt_disable(); struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); @@ -713,8 +722,10 @@ int bch2_accounting_read(struct bch_fs *c) } } preempt_enable(); +fsck_err: percpu_up_read(&c->mark_lock); err: + printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; From 49fd90b2cc332b8607a616d99d4bb792f18208b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Sep 2024 18:17:31 -0400 Subject: [PATCH 324/352] bcachefs: Fix unlocked access to c->disk_sb.sb in bch2_replicas_entry_validate() Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/replicas.c | 18 ++++++++++++++---- fs/bcachefs/replicas.h | 2 +- 3 files changed, 16 insertions(+), 6 deletions(-) 
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 30460bce04bec..954f6a96e0f4d 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -605,7 +605,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, goto out; } - if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err), + if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), c, version, jset, entry, journal_entry_data_usage_bad_size, "invalid journal entry usage: %s", err.buf)) { diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 998c0bd068023..bcb3276747e00 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } -int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, - struct bch_sb *sb, - struct printbuf *err) +static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r, + struct bch_sb *sb, + struct printbuf *err) { if (!r->nr_devs) { prt_printf(err, "no devices in entry "); @@ -94,6 +94,16 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, return -BCH_ERR_invalid_replicas_entry; } +int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, + struct bch_fs *c, + struct printbuf *err) +{ + mutex_lock(&c->sb_lock); + int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err); + mutex_unlock(&c->sb_lock); + return ret; +} + void bch2_cpu_replicas_to_text(struct printbuf *out, struct bch_replicas_cpu *r) { @@ -676,7 +686,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); - int ret = bch2_replicas_entry_validate(e, sb, err); + int ret = bch2_replicas_entry_validate_locked(e, sb, err); if (ret) return ret; diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 622482559c3db..5aba2c1ce1331 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -10,7 +10,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry_v1 *); int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *, - struct bch_sb *, struct printbuf *); + struct bch_fs *, struct printbuf *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); static inline struct bch_replicas_entry_v1 * From 431312b59cf54f9cc99352d0c3d80ed30e9b7df5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Sep 2024 18:17:52 -0400 Subject: [PATCH 325/352] bcachefs: Fix disk accounting attempting to mark invalid replicas entry This fixes the following bug, where a disk accounting key has an invalid replicas entry, and we attempt to add it to the superblock: bcachefs (3c0860e8-07ca-4276-8954-11c1774be868): starting version 1.12: rebalance_work_acct_fix opts=metadata_replicas=2,data_replicas=2,foreground_target=ssd,background_target=hdd,nopromote_whole_extents,verbose,fsck,fix_errors=yes bcachefs (3c0860e8-07ca-4276-8954-11c1774be868): recovering from clean shutdown, journal seq 15211644 bcachefs (3c0860e8-07ca-4276-8954-11c1774be868): accounting_read... 
accounting not marked in superblock replicas replicas cached: 1/1 [0], fixing bcachefs (3c0860e8-07ca-4276-8954-11c1774be868): sb invalid before write: Invalid superblock section replicas_v0: invalid device 0 in entry cached: 1/1 [0] replicas_v0 (size 88): user: 2 [3 5] user: 2 [1 4] cached: 1 [2] btree: 2 [1 2] user: 2 [2 5] cached: 1 [0] cached: 1 [4] journal: 2 [1 5] user: 2 [1 2] user: 2 [2 3] user: 2 [3 4] user: 2 [4 5] cached: 1 [1] cached: 1 [3] cached: 1 [5] journal: 2 [1 2] journal: 2 [2 5] btree: 2 [2 5] user: 2 [1 3] user: 2 [1 5] user: 2 [2 4] bcachefs (3c0860e8-07ca-4276-8954-11c1774be868): inconsistency detected - emergency read only at journal seq 15211644 accounting not marked in superblock replicas replicas user: 1/1 [3], fixing bcachefs (3c0860e8-07ca-4276-8954-11c1774be868): sb invalid before write: Invalid superblock section replicas_v0: invalid device 0 in entry cached: 1/1 [0] replicas_v0 (size 96): user: 2 [3 5] user: 2 [1 3] cached: 1 [2] btree: 2 [1 2] user: 2 [2 4] cached: 1 [0] cached: 1 [4] journal: 2 [1 5] user: 1 [3] user: 2 [1 5] user: 2 [3 4] user: 2 [4 5] cached: 1 [1] cached: 1 [3] cached: 1 [5] journal: 2 [1 2] journal: 2 [2 5] btree: 2 [2 5] user: 2 [1 2] user: 2 [1 4] user: 2 [2 3] user: 2 [2 5] accounting not marked in superblock replicas replicas user: 1/2 [3 7], fixing bcachefs (3c0860e8-07ca-4276-8954-11c1774be868): sb invalid before write: Invalid superblock section replicas_v0: invalid device 7 in entry user: 1/2 [3 7] replicas_v0 (size 96): user: 2 [3 7] user: 2 [1 3] cached: 1 [2] btree: 2 [1 2] user: 2 [2 4] cached: 1 [0] cached: 1 [4] journal: 2 [1 5] user: 1 [3] user: 2 [1 5] user: 2 [3 4] user: 2 [4 5] cached: 1 [1] cached: 1 [3] cached: 1 [5] journal: 2 [1 2] journal: 2 [2 5] btree: 2 [2 5] user: 2 [1 2] user: 2 [1 4] user: 2 [2 3] user: 2 [2 5] user: 2 [3 5] done bcachefs (3c0860e8-07ca-4276-8954-11c1774be868): alloc_read... done bcachefs (3c0860e8-07ca-4276-8954-11c1774be868): stripes_read... done bcachefs (3c0860e8-07ca-4276-8954-11c1774be868): snapshots_read... 
done Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 6e5e8b6f71822..669214127c16a 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -671,10 +671,16 @@ int bch2_accounting_read(struct bch_fs *c) continue; struct bch_replicas_padded r; - if (!accounting_to_replicas(&r.e, acc->k.data[i].pos)) continue; + /* + * If the replicas entry is invalid it'll get cleaned up by + * check_allocations: + */ + if (bch2_replicas_entry_validate(&r.e, c, &buf)) + continue; + struct disk_accounting_pos k; bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); @@ -683,8 +689,17 @@ int bch2_accounting_read(struct bch_fs *c) "accounting not marked in superblock replicas\n %s", (printbuf_reset(&buf), bch2_accounting_key_to_text(&buf, &k), - buf.buf))) - ret = bch2_accounting_update_sb_one(c, acc->k.data[i].pos); + buf.buf))) { + /* + * We're not RW yet and still single threaded, dropping + * and retaking lock is ok: + */ + percpu_up_read(&c->mark_lock); + ret = bch2_mark_replicas(c, &r.e); + if (ret) + goto fsck_err; + percpu_down_read(&c->mark_lock); + } } preempt_disable(); From 7c980a43e936e32741a62bf5a047c5f5ad572ec8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Sep 2024 16:50:29 -0400 Subject: [PATCH 326/352] bcachefs: Move transaction commit path validation to as late as possible In order to check for accounting keys with version=0, we need to run validation after they've been assigned version numbers. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 68 ++++++++++++++++---------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 4a131e3b62251..145d82aed4ecb 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -735,6 +735,40 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } + trans_for_each_update(trans, i) { + enum bch_validate_flags invalid_flags = 0; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; + + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags); + if (unlikely(ret)){ + bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", + trans->fn, (void *) i->ip_allocated); + goto fatal_err; + } + btree_insert_entry_checks(trans, i); + } + + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) { + enum bch_validate_flags invalid_flags = 0; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; + + ret = bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, invalid_flags); + if (unlikely(ret)) { + bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", + trans->fn); + goto fatal_err; + } + } + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; @@ -1019,40 +1053,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (ret) goto out_reset; - trans_for_each_update(trans, i) { - enum bch_validate_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= 
BCH_VALIDATE_write|BCH_VALIDATE_commit; - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags); - if (unlikely(ret)){ - bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", - trans->fn, (void *) i->ip_allocated); - return ret; - } - btree_insert_entry_checks(trans, i); - } - - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) { - enum bch_validate_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - - ret = bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags); - if (unlikely(ret)) { - bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", - trans->fn); - return ret; - } - } - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { ret = do_bch2_trans_commit_to_journal_replay(trans); goto out_reset; From 5612daafb76420c6793dc48ce6d0c20f36cc7981 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Sep 2024 16:51:19 -0400 Subject: [PATCH 327/352] bcachefs: Fix fsck warnings from bkey validation __bch2_fsck_err() warns if the current task has a btree_trans object and it wasn't passed in, because if it has to prompt for user input it has to be able to unlock it. But plumbing the btree_trans through bkey_validate(), as well as transaction restarts, is problematic - so instead make bkey fsck errors FSCK_AUTOFIX, which doesn't need to warn. Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 14 +++++++++++++- fs/bcachefs/error.h | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 95afa7bf20205..3a16b535b6c32 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -239,7 +239,19 @@ int __bch2_fsck_err(struct bch_fs *c, if (!c) c = trans->c; - WARN_ON(!trans && bch2_current_has_btree_trans(c)); + /* + * Ugly: if there's a transaction in the current task it has to be + * passed in to unlock if we prompt for user input. + * + * But, plumbing a transaction and transaction restarts into + * bkey_validate() is problematic. + * + * So: + * - make all bkey errors AUTOFIX, they're simple anyways (we just + * delete the key) + * - and we don't need to warn if we're not prompting + */ + WARN_ON(!(flags & FSCK_AUTOFIX) && !trans && bch2_current_has_btree_trans(c)); if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 2f1b86978f366..21ee7211b03e8 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -184,7 +184,7 @@ do { \ ret = -BCH_ERR_fsck_delete_bkey; \ goto fsck_err; \ } \ - int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX, \ + int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX|FSCK_AUTOFIX,\ BCH_FSCK_ERR_##_err_type, \ _err_msg, ##__VA_ARGS__); \ if (_ret != -BCH_ERR_fsck_fix && \ From 8d65b15f8d93638cfa9dae20a4274d5059c3b9d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Sep 2024 15:30:17 -0400 Subject: [PATCH 328/352] bcachefs: Fix BCH_SB_ERRS() so we can reorder BCH_SB_ERRS() has a field for the actual enum val so that we can reorder to reorganize, but the way BCH_SB_ERR_MAX was defined didn't allow for this. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/sb-downgrade.c | 5 ++--- fs/bcachefs/sb-errors.c | 6 +++--- fs/bcachefs/sb-errors.h | 2 ++ fs/bcachefs/sb-errors_format.h | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c711d4c27a03f..e6dd4812c0203 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -776,7 +776,7 @@ struct bch_fs { unsigned nsec_per_time_unit; u64 features; u64 compat; - unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; + unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; u64 btrees_lost_data; } sb; diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 6f0493f79959f..5102059a0f1dc 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -312,8 +312,7 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb, if (!first) prt_char(out, ','); first = false; - unsigned e = le16_to_cpu(i->errors[j]); - prt_str(out, e < BCH_SB_ERR_MAX ? bch2_sb_error_strs[e] : "(unknown)"); + bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j])); } prt_newline(out); } @@ -401,7 +400,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { unsigned e = le16_to_cpu(i->errors[j]); - if (e < BCH_SB_ERR_MAX) + if (e < BCH_FSCK_ERR_MAX) __set_bit(e, c->sb.errors_silent); if (e < sizeof(ext->errors_silent) * 8) __set_bit_le64(e, ext->errors_silent); diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index c1270d790e43b..013a96883b4e2 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -7,12 +7,12 @@ const char * const bch2_sb_error_strs[] = { #define x(t, n, ...) [n] = #t, BCH_SB_ERRS() - NULL +#undef x }; -static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) +void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) { - if (id < BCH_SB_ERR_MAX) + if (id < BCH_FSCK_ERR_MAX) prt_str(out, bch2_sb_error_strs[id]); else prt_printf(out, "(unknown error %u)", id); diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h index 8889001e7db4b..b2357b8e6107b 100644 --- a/fs/bcachefs/sb-errors.h +++ b/fs/bcachefs/sb-errors.h @@ -6,6 +6,8 @@ extern const char * const bch2_sb_error_strs[]; +void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id); + extern const struct bch_sb_field_ops bch_sb_field_ops_errors; void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 4e4e132d6d232..49d8609c4a08a 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -292,12 +292,12 @@ enum bch_fsck_flags { x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ + x(MAX, 281, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, BCH_SB_ERRS() #undef x - BCH_SB_ERR_MAX }; struct bch_sb_field_errors { From fd65378db9998a6deafdc4910ee1b01b377d6fee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Sep 2024 15:19:17 -0400 Subject: [PATCH 329/352] bcachefs: Don't delete unlinked inodes before logged op resume Previously, check_inode() would delete unlinked inodes if they weren't on the deleted list - this code dating from before there was a deleted list. 
But, if we crash during a logged op (truncate or finsert/fcollapse) of an unlinked file, logged op resume will get confused if the inode has already been deleted - instead, just add it to the deleted list if it needs to be there; delete_dead_inodes runs after logged op resume. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 49 ++++++++++++++++++----------- fs/bcachefs/recovery_passes_types.h | 2 +- fs/bcachefs/sb-errors_format.h | 3 +- fs/bcachefs/super-io.c | 3 +- 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8fe67abae36e3..10e2930b62da3 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1049,26 +1049,39 @@ static int check_inode(struct btree_trans *trans, } if (u.bi_flags & BCH_INODE_unlinked) { - ret = check_inode_deleted_list(trans, k.k->p); - if (ret < 0) - return ret; + if (!test_bit(BCH_FS_started, &c->flags)) { + /* + * If we're not in online fsck, don't delete unlinked + * inodes, just make sure they're on the deleted list. + * + * They might be referred to by a logged operation - + * i.e. we might have crashed in the middle of a + * truncate on an unlinked but open file - so we want to + * let the delete_dead_inodes kill it after resuming + * logged ops. + */ + ret = check_inode_deleted_list(trans, k.k->p); + if (ret < 0) + return ret; - fsck_err_on(!ret, - trans, unlinked_inode_not_on_deleted_list, - "inode %llu:%u unlinked, but not on deleted list", - u.bi_inum, k.k->p.snapshot); - ret = 0; - } + fsck_err_on(!ret, + trans, unlinked_inode_not_on_deleted_list, + "inode %llu:%u unlinked, but not on deleted list", + u.bi_inum, k.k->p.snapshot); - if (u.bi_flags & BCH_INODE_unlinked && - !bch2_inode_is_open(c, k.k->p) && - (!c->sb.clean || - fsck_err(trans, inode_unlinked_but_clean, - "filesystem marked clean, but inode %llu unlinked", - u.bi_inum))) { - ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); - bch_err_msg(c, ret, "in fsck deleting inode"); - return ret; + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1); + if (ret) + goto err; + } else { + if (fsck_err_on(bch2_inode_is_open(c, k.k->p), + trans, inode_unlinked_and_not_open, + "inode %llu%u unlinked and not open", + u.bi_inum, u.bi_snapshot)) { + ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck deleting inode"); + return ret; + } + } } if (u.bi_flags & BCH_INODE_i_size_dirty && diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 8c7dee5983d27..50406ce0e4ef1 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -50,7 +50,7 @@ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ + x(delete_dead_inodes, 32, PASS_ALWAYS) \ x(fix_reflink_p, 33, 0) \ x(set_fs_needs_rebalance, 34, 0) \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 49d8609c4a08a..3fca1d836318a 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -210,6 +210,7 @@ enum bch_fsck_flags { x(inode_snapshot_mismatch, 196, 0) \ x(inode_unlinked_but_clean, 197, 0) \ x(inode_unlinked_but_nlink_nonzero, 198, 0) \ + x(inode_unlinked_and_not_open, 281, 0) \ x(inode_checksum_type_invalid, 199, 0) \ x(inode_compression_type_invalid, 200, 0) \ x(inode_subvol_root_but_not_dir, 201, 0) \ @@ -292,7 +293,7 @@ enum bch_fsck_flags { 
x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ - x(MAX, 281, 0) + x(MAX, 282, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index b2e914841b5f9..ce7410d720892 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1190,7 +1190,8 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8); prt_printf(out, "Errors to silently fix:\t"); - prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8); + prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, + min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8)); prt_newline(out); kfree(errors_silent); From cf49f8a8c277f9f2b78e2a56189a741a508a9820 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Sep 2024 15:49:17 -0400 Subject: [PATCH 330/352] bcachefs: rename version -> bversion give bversions a more distinct name, to aid in grepping Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- fs/bcachefs/bcachefs_format.h | 6 +++--- fs/bcachefs/bkey.h | 4 ++-- fs/bcachefs/bkey_methods.c | 2 +- fs/bcachefs/bkey_methods.h | 2 +- fs/bcachefs/btree_gc.c | 6 +++--- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_trans_commit.c | 8 ++++---- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/disk_accounting.c | 6 +++--- fs/bcachefs/disk_accounting.h | 4 ++-- fs/bcachefs/disk_accounting_types.h | 2 +- fs/bcachefs/io_read.c | 4 ++-- fs/bcachefs/io_write.c | 4 ++-- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/reflink.c | 2 +- fs/bcachefs/tests.c | 2 +- 17 files changed, 30 insertions(+), 30 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index e11989a57ca0f..47455a85c9092 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -501,7 +501,7 @@ static int check_extent_checksum(struct btree_trans *trans, prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); bch2_bkey_val_to_text(&buf, c, extent2); - struct nonce nonce = extent_nonce(extent.k->version, p.crc); + struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), trans, dup_backpointer_to_bad_csum_extent, diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 8c4addddd07e0..203ee627cab5d 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -217,7 +217,7 @@ struct bkey { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ __u8 pad[1]; - struct bversion version; + struct bversion bversion; __u32 size; /* extent size, in sectors */ struct bpos p; #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ @@ -328,8 +328,8 @@ enum bch_bkey_fields { bkey_format_field(OFFSET, p.offset), \ bkey_format_field(SNAPSHOT, p.snapshot), \ bkey_format_field(SIZE, size), \ - bkey_format_field(VERSION_HI, version.hi), \ - bkey_format_field(VERSION_LO, version.lo), \ + bkey_format_field(VERSION_HI, bversion.hi), \ + bkey_format_field(VERSION_LO, bversion.lo), \ }, \ }) diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index e34cb2bf329c5..6e5092d4c62eb 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -554,8 +554,8 @@ static inline void bch2_bkey_pack_test(void) {} x(BKEY_FIELD_OFFSET, p.offset) \ 
x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ x(BKEY_FIELD_SIZE, size) \ - x(BKEY_FIELD_VERSION_HI, version.hi) \ - x(BKEY_FIELD_VERSION_LO, version.lo) + x(BKEY_FIELD_VERSION_HI, bversion.hi) \ + x(BKEY_FIELD_VERSION_LO, bversion.lo) struct bkey_format_state { u64 field_min[BKEY_NR_FIELDS]; diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 88d8958281e80..e7ac227ba7e89 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -289,7 +289,7 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) bch2_bpos_to_text(out, k->p); - prt_printf(out, " len %u ver %llu", k->size, k->version.lo); + prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo); } else { prt_printf(out, "(null)"); } diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 3df3dd2723a12..018fb72e32d39 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -70,7 +70,7 @@ bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) { return l->type == r->type && - !bversion_cmp(l->version, r->version) && + !bversion_cmp(l->bversion, r->bversion) && bpos_eq(l->p, bkey_start_pos(r)); } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 40b08fef3c398..660d2fa02da21 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -601,15 +601,15 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (initial) { BUG_ON(bch2_journal_seq_verify && - k.k->version.lo > atomic64_read(&c->journal.seq)); + k.k->bversion.lo > atomic64_read(&c->journal.seq)); if (fsck_err_on(btree_id != BTREE_ID_accounting && - k.k->version.lo > atomic64_read(&c->key_version), + k.k->bversion.lo > atomic64_read(&c->key_version), trans, bkey_version_in_future, "key version number higher than recorded %llu\n %s", atomic64_read(&c->key_version), (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - atomic64_set(&c->key_version, k.k->version.lo); + atomic64_set(&c->key_version, k.k->bversion.lo); } if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2a0420605bd7f..1c1448b52207b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1223,7 +1223,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ret = bch2_bkey_val_validate(c, u.s_c, READ); if (ret == -BCH_ERR_fsck_delete_bkey || (bch2_inject_invalid_keys && - !bversion_cmp(u.k->version, MAX_VERSION))) { + !bversion_cmp(u.k->bversion, MAX_VERSION))) { btree_keys_account_key_drop(&b->nr, 0, k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 145d82aed4ecb..0989e2da6e56d 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -684,10 +684,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, !(flags & BCH_TRANS_COMMIT_no_journal_res)) { if (bch2_journal_seq_verify) trans_for_each_update(trans, i) - i->k->k.version.lo = trans->journal_res.seq; + i->k->k.bversion.lo = trans->journal_res.seq; else if (bch2_inject_invalid_keys) trans_for_each_update(trans, i) - i->k->k.version = MAX_VERSION; + i->k->k.bversion = MAX_VERSION; } h = trans->hooks; @@ -709,9 +709,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (jset_entry_is_key(entry) && entry->start->k.type == KEY_TYPE_accounting) { struct 
bkey_i_accounting *a = bkey_i_to_accounting(entry->start); - a->k.version = journal_pos_to_bversion(&trans->journal_res, + a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) entry - (u64 *) trans->journal_entries); - BUG_ON(bversion_zero(a->k.version)); + BUG_ON(bversion_zero(a->k.bversion)); ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal); if (ret) goto revert_fs_usage; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 757b9884ef558..462b1a2fe1ad8 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -639,7 +639,7 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_write_op_init(&m->op, c, io_opts); m->op.pos = bkey_start_pos(k.k); - m->op.version = k.k->version; + m->op.version = k.k->bversion; m->op.target = data_opts.target; m->op.write_point = wp; m->op.nr_replicas = 0; diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 669214127c16a..9ac45a607b931 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -291,7 +291,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun struct accounting_mem_entry n = { .pos = a.k->p, - .version = a.k->version, + .bversion = a.k->bversion, .nr_counters = bch2_accounting_counters(a.k), .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), sizeof(u64), GFP_KERNEL), @@ -636,7 +636,7 @@ int bch2_accounting_read(struct bch_fs *c) accounting_pos_cmp, &k.k->p); bool applied = idx < acc->k.nr && - bversion_cmp(acc->k.data[idx].version, k.k->version) >= 0; + bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0; if (applied) continue; @@ -644,7 +644,7 @@ int bch2_accounting_read(struct bch_fs *c) if (i + 1 < &darray_top(*keys) && i[1].k->k.type == KEY_TYPE_accounting && !journal_key_cmp(i, i + 1)) { - BUG_ON(bversion_cmp(i[0].k->k.version, i[1].k->k.version) >= 0); + BUG_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0); i[1].journal_seq = i[0].journal_seq; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 243bfb0975eab..4ea6c8a092bc1 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -36,8 +36,8 @@ static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++) dst->v.d[i] += src.v->d[i]; - if (bversion_cmp(dst->k.version, src.k->version) < 0) - dst->k.version = src.k->version; + if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0) + dst->k.bversion = src.k->bversion; } static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h index 1687a45177a7f..b1982131b2066 100644 --- a/fs/bcachefs/disk_accounting_types.h +++ b/fs/bcachefs/disk_accounting_types.h @@ -6,7 +6,7 @@ struct accounting_mem_entry { struct bpos pos; - struct bversion version; + struct bversion bversion; unsigned nr_counters; u64 __percpu *v[2]; }; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index b2f50e74bb76e..e4fc17c548fd5 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -517,7 +517,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if ((ret = bkey_err(k))) goto out; - if (bversion_cmp(k.k->version, rbio->version) || + if (bversion_cmp(k.k->bversion, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; @@ -1031,7 +1031,7 @@ int 
__bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; - rbio->version = k.k->version; + rbio->version = k.k->bversion; rbio->promote = promote; INIT_WORK(&rbio->work, NULL); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index d3b5be7fd9bf6..b5fe9e0dc1557 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -697,7 +697,7 @@ static void init_append_extent(struct bch_write_op *op, e = bkey_extent_init(op->insert_keys.top); e->k.p = op->pos; e->k.size = crc.uncompressed_size; - e->k.version = version; + e->k.bversion = version; if (crc.csum_type || crc.compression_type || @@ -1544,7 +1544,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) id = bkey_inline_data_init(op->insert_keys.top); id->k.p = op->pos; - id->k.version = op->version; + id->k.bversion = op->version; id->k.size = sectors; iter = bio->bi_iter; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d1699aecc9fb6..c84295a072320 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -151,7 +151,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u); /* Has this delta already been applied to the btree? */ - if (bversion_cmp(old.k->version, k->k->k.version) >= 0) { + if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) { ret = 0; goto out; } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index e59c0abb47723..f457925fa362e 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -367,7 +367,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_v->k.type = bkey_type_to_indirect(&orig->k); r_v->k.p = reflink_iter.pos; bch2_key_resize(&r_v->k, orig->k.size); - r_v->k.version = orig->k.version; + r_v->k.bversion = orig->k.bversion; set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 01b768c9b7675..b2f209743afeb 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -394,7 +394,7 @@ static int insert_test_extent(struct bch_fs *c, k.k_i.k.p.offset = end; k.k_i.k.p.snapshot = U32_MAX; k.k_i.k.size = end - start; - k.k_i.k.version.lo = test_version++; + k.k_i.k.bversion.lo = test_version++; ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); bch_err_fn(c, ret); From f8911ad88de3acea7a67451f59649bb54da0741b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Sep 2024 15:58:02 -0400 Subject: [PATCH 331/352] bcachefs: Check for accounting keys with bversion=0 Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 4 ++-- fs/bcachefs/disk_accounting.c | 4 ++++ fs/bcachefs/sb-errors_format.h | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 6e5092d4c62eb..41df24a53d971 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -214,9 +214,9 @@ static __always_inline int bversion_cmp(struct bversion l, struct bversion r) #define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) #define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) -static __always_inline int bversion_zero(struct bversion v) +static __always_inline bool bversion_zero(struct bversion v) { - return !bversion_cmp(v, ZERO_VERSION); + return bversion_cmp(v, ZERO_VERSION) == 0; } #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/disk_accounting.c 
b/fs/bcachefs/disk_accounting.c index 9ac45a607b931..59897b347c62a 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -134,6 +134,10 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, void *end = &acc_k + 1; int ret = 0; + bkey_fsck_err_on(bversion_zero(k.k->bversion), + c, accounting_key_version_0, + "accounting key with version=0"); + switch (acc_k.type) { case BCH_DISK_ACCOUNTING_nr_inodes: end = field_end(acc_k, nr_inodes); diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 3fca1d836318a..6955bb4ea4c52 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -293,7 +293,8 @@ enum bch_fsck_flags { x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ - x(MAX, 282, 0) + x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ + x(MAX, 283, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, From a3581ca35d2b7e854e071dec2df7de7152aaa5c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Sep 2024 15:59:29 -0400 Subject: [PATCH 332/352] bcachefs: Fix BCH_TRANS_COMMIT_skip_accounting_apply This was added to avoid double-counting accounting keys in journal replay. But applied incorrectly (easily done since it applies to the transaction commit, not a particular update), it leads to skipping in-mem accounting for real accounting updates, and failure to give them a version number - which leads to journal replay becoming very confused the next time around. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 36 ++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 0989e2da6e56d..1a74a1a252ee2 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -700,27 +700,31 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct jset_entry *entry = trans->journal_entries; - if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { - percpu_down_read(&c->mark_lock); - - for (entry = trans->journal_entries; - entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - entry = vstruct_next(entry)) - if (jset_entry_is_key(entry) && entry->start->k.type == KEY_TYPE_accounting) { - struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); - - a->k.bversion = journal_pos_to_bversion(&trans->journal_res, - (u64 *) entry - (u64 *) trans->journal_entries); - BUG_ON(bversion_zero(a->k.bversion)); + percpu_down_read(&c->mark_lock); + + for (entry = trans->journal_entries; + entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && + entry->start->k.type == KEY_TYPE_accounting) { + BUG_ON(!trans->journal_res.ref); + + struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); + + a->k.bversion = journal_pos_to_bversion(&trans->journal_res, + (u64 *) entry - (u64 *) trans->journal_entries); + BUG_ON(bversion_zero(a->k.bversion)); + + if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal); if (ret) goto revert_fs_usage; } - percpu_up_read(&c->mark_lock); + } + percpu_up_read(&c->mark_lock); - /* XXX: we only want to run this 
if deltas are nonzero */ - bch2_trans_account_disk_usage_change(trans); - } + /* XXX: we only want to run this if deltas are nonzero */ + bch2_trans_account_disk_usage_change(trans); trans_for_each_update(trans, i) if (btree_node_type_has_atomic_triggers(i->bkey_type)) { From 9773547b16b1e1b27f002733623cd0e8e6d0f69c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 27 Sep 2024 21:05:59 -0400 Subject: [PATCH 333/352] bcachefs: Convert disk accounting BUG_ON() to WARN_ON() We had a bug where disk accounting keys didn't always have their version field set in journal replay; change the BUG_ON() to a WARN(), and exclude this case since it's now checked for elsewhere (in the bkey validate function). Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 59897b347c62a..9f3133e3e7e5e 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -648,7 +648,7 @@ int bch2_accounting_read(struct bch_fs *c) if (i + 1 < &darray_top(*keys) && i[1].k->k.type == KEY_TYPE_accounting && !journal_key_cmp(i, i + 1)) { - BUG_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0); + WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0); i[1].journal_seq = i[0].journal_seq; From 1c0ee43b2c9057473e551e2464f24f717accabf6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Sep 2024 16:19:58 -0400 Subject: [PATCH 334/352] bcachefs: BCH_FS_clean_recovery Add a filesystem flag to indicate whether we did a clean recovery - using c->sb.clean after we've got rw is incorrect, since c->sb is updated whenever we write the superblock. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/fsck.c | 6 ++++-- fs/bcachefs/inode.c | 2 +- fs/bcachefs/recovery.c | 2 ++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e6dd4812c0203..f4151ee51b034 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -594,6 +594,7 @@ struct bch_dev { #define BCH_FS_FLAGS() \ x(new_fs) \ x(started) \ + x(clean_recovery) \ x(btree_running) \ x(accounting_replay_done) \ x(may_go_rw) \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 10e2930b62da3..9c10eeeb0bcb8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1084,8 +1084,9 @@ static int check_inode(struct btree_trans *trans, } } + /* i_size_dirty is vestigal, since we now have logged ops for truncate * */ if (u.bi_flags & BCH_INODE_i_size_dirty && - (!c->sb.clean || + (!test_bit(BCH_FS_clean_recovery, &c->flags) || fsck_err(trans, inode_i_size_dirty_but_clean, "filesystem marked clean, but inode %llu has i_size dirty", u.bi_inum))) { @@ -1114,8 +1115,9 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + /* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */ if (u.bi_flags & BCH_INODE_i_sectors_dirty && - (!c->sb.clean || + (!test_bit(BCH_FS_clean_recovery, &c->flags) || fsck_err(trans, inode_i_sectors_dirty_but_clean, "filesystem marked clean, but inode %llu has i_sectors dirty", u.bi_inum))) { diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 1116db239708f..753c208896c3b 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1113,7 +1113,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, pos.offset, pos.snapshot)) goto delete; - if (c->sb.clean && + if (test_bit(BCH_FS_clean_recovery, &c->flags) && 
!fsck_err(trans, deleted_inode_but_clean, "filesystem marked as clean but have deleted inode %llu:%u", pos.offset, pos.snapshot)) { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c84295a072320..6db72d3bad7db 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -717,6 +717,8 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.fsck) set_bit(BCH_FS_fsck_running, &c->flags); + if (c->sb.clean) + set_bit(BCH_FS_clean_recovery, &c->flags); ret = bch2_blacklist_table_initialize(c); if (ret) { From d50d7a5fa4df3190b6b6c6d6551b631fda4a4ed2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 26 Sep 2024 16:23:30 -0400 Subject: [PATCH 335/352] bcachefs: Check for logged ops when clean If we shut down successfully, there shouldn't be any logged ops to resume. Signed-off-by: Kent Overstreet --- fs/bcachefs/logged_ops.c | 13 +++++++++++-- fs/bcachefs/sb-errors_format.h | 3 ++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index f49fdca1d07dc..6f4a4e1083c95 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -37,6 +37,14 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); struct bkey_buf sk; u32 restart_count = trans->restart_count; + struct printbuf buf = PRINTBUF; + int ret = 0; + + fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags), + trans, logged_op_but_clean, + "filesystem marked as clean but have logged op\n%s", + (bch2_bkey_val_to_text(&buf, c, k), + buf.buf)); if (!fn) return 0; @@ -47,8 +55,9 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, fn->resume(trans, sk.k); bch2_bkey_buf_exit(&sk, c); - - return trans_was_restarted(trans, restart_count); +fsck_err: + printbuf_exit(&buf); + return ret ?: trans_was_restarted(trans, restart_count); } int bch2_resume_logged_ops(struct bch_fs *c) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 6955bb4ea4c52..c15c52ebf699b 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -294,7 +294,8 @@ enum bch_fsck_flags { x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ - x(MAX, 283, 0) + x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ + x(MAX, 284, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, From e057a290ef715d2765560778625e1660b7352994 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Tue, 27 Aug 2024 23:14:48 +0800 Subject: [PATCH 336/352] bcachefs: Fix lost wake up If the reader acquires the read lock and then the writer enters the slow path, while the reader proceeds to the unlock path, the following scenario can occur without the change: writer: pcpu_read_count(lock) return 1 (so __do_six_trylock will return 0) reader: this_cpu_dec(*lock->readers) reader: smp_mb() reader: state = atomic_read(&lock->state) (there is no waiting flag set) writer: six_set_bitmask() then the writer will sleep forever. 
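The barrier pairing can be illustrated with a small stand-alone model
(a hedged sketch in C11 atomics with made-up variable names, not the
actual six lock code): each side stores its own flag, executes a full
barrier, then loads the other side's flag, so at least one of the two
sides is guaranteed to observe the other's store.

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_int readers;        /* stand-in for the per-cpu reader count */
	static atomic_int waiting_write;  /* stand-in for the write-waiting bit in lock->state */

	/* writer slow path: announce intent, then check for readers */
	static bool writer_should_sleep(void)
	{
		atomic_store_explicit(&waiting_write, 1, memory_order_relaxed);
		atomic_thread_fence(memory_order_seq_cst);  /* the unconditional smp_mb() added here */
		return atomic_load_explicit(&readers, memory_order_relaxed) != 0;
	}

	/* reader unlock path: drop the count, then check whether a writer is waiting */
	static bool reader_must_wake_writer(void)
	{
		atomic_fetch_sub_explicit(&readers, 1, memory_order_relaxed);
		atomic_thread_fence(memory_order_seq_cst);  /* the existing smp_mb() in the unlock path */
		return atomic_load_explicit(&waiting_write, memory_order_relaxed) != 0;
	}

With both fences present, the writer cannot miss the reader's decrement
while the reader simultaneously misses the waiting flag, so the wake up
can no longer be lost.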
Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 3a494c5d12478..b3583c50ef04e 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -169,11 +169,17 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, ret = -1 - SIX_LOCK_write; } } else if (type == SIX_LOCK_write && lock->readers) { - if (try) { + if (try) atomic_add(SIX_LOCK_HELD_write, &lock->state); - smp_mb__after_atomic(); - } + /* + * Make sure atomic_add happens before pcpu_read_count and + * six_set_bitmask in slow path happens before pcpu_read_count. + * + * Paired with the smp_mb() in read lock fast path (per-cpu mode) + * and the one before atomic_read in read unlock path. + */ + smp_mb(); ret = !pcpu_read_count(lock); if (try && !ret) { From a6508079b1b6b231d16c438c384d718d3508573c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 22:22:00 -0400 Subject: [PATCH 337/352] bcachefs: dirent_points_to_inode() now warns on mismatch if an inode backpointer points to a dirent that doesn't point back, that's an error we should warn about. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 84 ++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 28 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9c10eeeb0bcb8..0e397d0ef3dd6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -21,6 +21,49 @@ #include #include /* struct qstr */ +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + if (d.v->d_type == DT_SUBVOL + ? 
le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum) + return 0; + return -BCH_ERR_ENOENT_dirent_doesnt_match_inode; +} + +static void dirent_inode_mismatch_msg(struct printbuf *out, + struct bch_fs *c, + struct bkey_s_c_dirent dirent, + struct bch_inode_unpacked *inode) +{ + prt_str(out, "inode points to dirent that does not point back:"); + prt_newline(out); + bch2_bkey_val_to_text(out, c, dirent.s_c); + prt_newline(out); + bch2_inode_unpacked_to_text(out, inode); +} + +static int dirent_points_to_inode(struct bch_fs *c, + struct bkey_s_c_dirent dirent, + struct bch_inode_unpacked *inode) +{ + int ret = dirent_points_to_inode_nowarn(dirent, inode); + if (ret) { + struct printbuf buf = PRINTBUF; + dirent_inode_mismatch_msg(&buf, c, dirent, inode); + bch_warn(c, "%s", buf.buf); + printbuf_exit(&buf); + } + return ret; +} + /* * XXX: this is handling transaction restarts without returning * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: @@ -371,7 +414,8 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume return ret; ret = remove_backpointer(trans, &inode); - bch_err_msg(c, ret, "removing dirent"); + if (!bch2_err_matches(ret, ENOENT)) + bch_err_msg(c, ret, "removing dirent"); if (ret) return ret; @@ -900,21 +944,6 @@ static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); } -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - -static bool dirent_points_to_inode(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - return d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol - : le64_to_cpu(d.v->d_inum) == inode->bi_inum; -} - static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) { struct btree_iter iter; @@ -924,13 +953,14 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) return ret; } -static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k, +static int check_inode_dirent_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, - u32 inode_snapshot, bool *write_inode) + bool *write_inode) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + u32 inode_snapshot = inode->bi_snapshot; struct btree_iter dirent_iter = {}; struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); int ret = bkey_err(d); @@ -940,13 +970,13 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i if (fsck_err_on(ret, trans, inode_points_to_missing_dirent, "inode points to missing dirent\n%s", - (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || - fsck_err_on(!ret && !dirent_points_to_inode(d, inode), + (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || + fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode), trans, inode_points_to_wrong_dirent, - "inode points to dirent that does not point back:\n%s", - (bch2_bkey_val_to_text(&buf, c, inode_k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + "%s", + (printbuf_reset(&buf), + dirent_inode_mismatch_msg(&buf, c, d, inode), + buf.buf))) { /* * We just clear the backpointer fields for now. 
If we find a * dirent that points to this inode in check_dirents(), we'll @@ -1145,7 +1175,7 @@ static int check_inode(struct btree_trans *trans, } if (u.bi_dir || u.bi_dir_offset) { - ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update); + ret = check_inode_dirent_inode(trans, &u, &do_update); if (ret) goto err; } @@ -2474,10 +2504,8 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino if (ret && !bch2_err_matches(ret, ENOENT)) break; - if (!ret && !dirent_points_to_inode(d, &inode)) { + if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) bch2_trans_iter_exit(trans, &dirent_iter); - ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; - } if (bch2_err_matches(ret, ENOENT)) { ret = 0; From 0b0f0ad93c0833ed3b5457eb308274f340535988 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 22:27:13 -0400 Subject: [PATCH 338/352] bcachefs: remove_backpointer() now checks if dirent points to inode Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0e397d0ef3dd6..8f52911558a77 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -389,14 +389,17 @@ static int reattach_inode(struct btree_trans *trans, static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { - struct btree_iter iter; - struct bkey_s_c_dirent d; - int ret; + if (!inode->bi_dir) + return 0; - d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0, + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c_dirent d = + bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, + SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0, dirent); - ret = bkey_err(d) ?: + int ret = bkey_err(d) ?: + dirent_points_to_inode(c, d, inode) ?: __remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; From 3a5895e3ac2bb4b252a4e816575eeec6ac3deeec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 22:32:47 -0400 Subject: [PATCH 339/352] bcachefs: check_subvol_path() now prints subvol root inode Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 32 +++++++++++++------------------- fs/bcachefs/sb-errors_format.h | 2 +- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8f52911558a77..0d8b782b63fb6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2371,22 +2371,6 @@ static bool darray_u32_has(darray_u32 *d, u32 v) return false; } -/* - * We've checked that inode backpointers point to valid dirents; here, it's - * sufficient to check that the subvolume root has a dirent: - */ -static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s) -{ - struct bch_inode_unpacked inode; - int ret = bch2_inode_find_by_inum_trans(trans, - (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, - &inode); - if (ret) - return ret; - - return inode.bi_dir != 0; -} - static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; @@ -2405,14 +2389,24 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - ret = subvol_has_dirent(trans, s); - if (ret < 0) + struct bch_inode_unpacked subvol_root; + ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, 
le64_to_cpu(s.v->inode) }, + &subvol_root); + if (ret) break; - if (fsck_err_on(!ret, + /* + * We've checked that inode backpointers point to valid dirents; + * here, it's sufficient to check that the subvolume root has a + * dirent: + */ + if (fsck_err_on(!subvol_root.bi_dir, trans, subvol_unreachable, "unreachable subvolume %s", (bch2_bkey_val_to_text(&buf, c, s.s_c), + prt_newline(&buf), + bch2_inode_unpacked_to_text(&buf, &subvol_root), buf.buf))) { ret = reattach_subvol(trans, s); break; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index c15c52ebf699b..ed5dca5e11617 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -271,7 +271,7 @@ enum bch_fsck_flags { x(subvol_children_not_set, 256, 0) \ x(subvol_children_bad, 257, 0) \ x(subvol_loop, 258, 0) \ - x(subvol_unreachable, 259, 0) \ + x(subvol_unreachable, 259, FSCK_AUTOFIX) \ x(btree_node_bkey_bad_u64s, 260, 0) \ x(btree_node_topology_empty_interior_node, 261, 0) \ x(btree_ptr_v2_min_key_bad, 262, 0) \ From 716bf84ef39218a56fadaa413f70da008ad85888 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 11 Sep 2024 09:09:18 +0800 Subject: [PATCH 340/352] coccinelle: Add rules to find str_true_false() replacements After str_true_false() has been introduced in the tree, we can add rules for finding places where str_true_false() can be used. A simple test can find over 10 locations. Signed-off-by: Hongbo Li Signed-off-by: Julia Lawall --- scripts/coccinelle/api/string_choices.cocci | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci index 5e729f187f22e..6942ad7c42244 100644 --- a/scripts/coccinelle/api/string_choices.cocci +++ b/scripts/coccinelle/api/string_choices.cocci @@ -85,3 +85,22 @@ e << str_down_up_r.E; @@ coccilib.report.print_report(p[0], "opportunity for str_down_up(%s)" % e) + +@str_true_false depends on patch@ +expression E; +@@ +- ((E) ? "true" : "false") ++ str_true_false(E) + +@str_true_false_r depends on !patch exists@ +expression E; +position P; +@@ +* ((E@P) ? "true" : "false") + +@script:python depends on report@ +p << str_true_false_r.P; +e << str_true_false_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_true_false(%s)" % e) From 8a0236bab4d6f0761f29eae4d55d1ba32c1ecd47 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 11 Sep 2024 09:09:19 +0800 Subject: [PATCH 341/352] coccinelle: Add rules to find str_false_true() replacements As done with str_true_false(), add checks for str_false_true() opportunities. A simple test can find over 9 cases currently exist in the tree. Signed-off-by: Hongbo Li Signed-off-by: Julia Lawall --- scripts/coccinelle/api/string_choices.cocci | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci index 6942ad7c42244..c3c5bc94fab0b 100644 --- a/scripts/coccinelle/api/string_choices.cocci +++ b/scripts/coccinelle/api/string_choices.cocci @@ -104,3 +104,22 @@ e << str_true_false_r.E; @@ coccilib.report.print_report(p[0], "opportunity for str_true_false(%s)" % e) + +@str_false_true depends on patch@ +expression E; +@@ +- ((E) ? "false" : "true") ++ str_false_true(E) + +@str_false_true_r depends on !patch exists@ +expression E; +position P; +@@ +* ((E@P) ? 
"false" : "true") + +@script:python depends on report@ +p << str_false_true_r.P; +e << str_false_true_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_false_true(%s)" % e) From d4c7544002db32dedbb162b2b89e655ac437ab08 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 11 Sep 2024 09:09:20 +0800 Subject: [PATCH 342/352] coccinelle: Add rules to find str_hi{gh}_lo{w}() replacements As other rules done, we add rules for str_hi{gh}_lo{w}() to check the relative opportunities. Signed-off-by: Hongbo Li Signed-off-by: Julia Lawall --- scripts/coccinelle/api/string_choices.cocci | 42 +++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci index c3c5bc94fab0b..7c631fd8abe2b 100644 --- a/scripts/coccinelle/api/string_choices.cocci +++ b/scripts/coccinelle/api/string_choices.cocci @@ -123,3 +123,45 @@ e << str_false_true_r.E; @@ coccilib.report.print_report(p[0], "opportunity for str_false_true(%s)" % e) + +@str_hi_lo depends on patch@ +expression E; +@@ +( +- ((E) ? "hi" : "lo") ++ str_hi_lo(E) +) + +@str_hi_lo_r depends on !patch exists@ +expression E; +position P; +@@ +( +* ((E@P) ? "hi" : "lo") +) + +@script:python depends on report@ +p << str_hi_lo_r.P; +e << str_hi_lo_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_hi_lo(%s)" % e) + +@str_high_low depends on patch@ +expression E; +@@ +- ((E) ? "high" : "low") ++ str_high_low(E) + +@str_high_low_r depends on !patch exists@ +expression E; +position P; +@@ +* ((E@P) ? "high" : "low") + +@script:python depends on report@ +p << str_high_low_r.P; +e << str_high_low_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_high_low(%s)" % e) From 5b7ca4507d64de729011c145d8ee9a2731f9502f Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 11 Sep 2024 09:09:21 +0800 Subject: [PATCH 343/352] coccinelle: Add rules to find str_lo{w}_hi{gh}() replacements As other rules done, we add rules for str_lo{w}_hi{gh}() to check the relative opportunities. Signed-off-by: Hongbo Li Signed-off-by: Julia Lawall --- scripts/coccinelle/api/string_choices.cocci | 38 +++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci index 7c631fd8abe2b..e91d4eb30161c 100644 --- a/scripts/coccinelle/api/string_choices.cocci +++ b/scripts/coccinelle/api/string_choices.cocci @@ -165,3 +165,41 @@ e << str_high_low_r.E; @@ coccilib.report.print_report(p[0], "opportunity for str_high_low(%s)" % e) + +@str_lo_hi depends on patch@ +expression E; +@@ +- ((E) ? "lo" : "hi") ++ str_lo_hi(E) + +@str_lo_hi_r depends on !patch exists@ +expression E; +position P; +@@ +* ((E@P) ? "lo" : "hi") + +@script:python depends on report@ +p << str_lo_hi_r.P; +e << str_lo_hi_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_lo_hi(%s)" % e) + +@str_low_high depends on patch@ +expression E; +@@ +- ((E) ? "low" : "high") ++ str_low_high(E) + +@str_low_high_r depends on !patch exists@ +expression E; +position P; +@@ +* ((E@P) ? 
"low" : "high") + +@script:python depends on report@ +p << str_low_high_r.P; +e << str_low_high_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_low_high(%s)" % e) From dd2275d349c2f02ceb6cd37f89b8b9920c602488 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 11 Sep 2024 09:09:22 +0800 Subject: [PATCH 344/352] coccinelle: Add rules to find str_enable{d}_disable{d}() replacements As other rules done, we add rules for str_enable{d}_ disable{d}() to check the relative opportunities. Signed-off-by: Hongbo Li Signed-off-by: Julia Lawall --- scripts/coccinelle/api/string_choices.cocci | 38 +++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci index e91d4eb30161c..bb6b851ee2aa4 100644 --- a/scripts/coccinelle/api/string_choices.cocci +++ b/scripts/coccinelle/api/string_choices.cocci @@ -203,3 +203,41 @@ e << str_low_high_r.E; @@ coccilib.report.print_report(p[0], "opportunity for str_low_high(%s)" % e) + +@str_enable_disable depends on patch@ +expression E; +@@ +- ((E) ? "enable" : "disable") ++ str_enable_disable(E) + +@str_enable_disable_r depends on !patch exists@ +expression E; +position P; +@@ +* ((E@P) ? "enable" : "disable") + +@script:python depends on report@ +p << str_enable_disable_r.P; +e << str_enable_disable_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_enable_disable(%s)" % e) + +@str_enabled_disabled depends on patch@ +expression E; +@@ +- ((E) ? "enabled" : "disabled") ++ str_enabled_disabled(E) + +@str_enabled_disabled_r depends on !patch exists@ +expression E; +position P; +@@ +* ((E@P) ? "enabled" : "disabled") + +@script:python depends on report@ +p << str_enabled_disabled_r.P; +e << str_enabled_disabled_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_enabled_disabled(%s)" % e) From ba4b514a6f4ac420d872b8f16245e3ffa05c10a5 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 11 Sep 2024 09:09:23 +0800 Subject: [PATCH 345/352] coccinelle: Add rules to find str_read_write() replacements As other rules done, we add rules for str_read_write() to check the relative opportunities. Signed-off-by: Hongbo Li Signed-off-by: Julia Lawall --- scripts/coccinelle/api/string_choices.cocci | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci index bb6b851ee2aa4..29c507d86ffde 100644 --- a/scripts/coccinelle/api/string_choices.cocci +++ b/scripts/coccinelle/api/string_choices.cocci @@ -241,3 +241,22 @@ e << str_enabled_disabled_r.E; @@ coccilib.report.print_report(p[0], "opportunity for str_enabled_disabled(%s)" % e) + +@str_read_write depends on patch@ +expression E; +@@ +- ((E) ? "read" : "write") ++ str_read_write(E) + +@str_read_write_r depends on !patch exists@ +expression E; +position P; +@@ +* ((E@P) ? "read" : "write") + +@script:python depends on report@ +p << str_read_write_r.P; +e << str_read_write_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_read_write(%s)" % e) From c81ca023c30691dcae543bd770e7a3a4c63263ff Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 11 Sep 2024 09:09:24 +0800 Subject: [PATCH 346/352] coccinelle: Add rules to find str_write_read() replacements As other rules done, we add rules for str_write_read() to check the relative opportunities. 
Signed-off-by: Hongbo Li Signed-off-by: Julia Lawall --- scripts/coccinelle/api/string_choices.cocci | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci index 29c507d86ffde..64f04da2e9f4d 100644 --- a/scripts/coccinelle/api/string_choices.cocci +++ b/scripts/coccinelle/api/string_choices.cocci @@ -260,3 +260,22 @@ e << str_read_write_r.E; @@ coccilib.report.print_report(p[0], "opportunity for str_read_write(%s)" % e) + +@str_write_read depends on patch@ +expression E; +@@ +- ((E) ? "write" : "read") ++ str_write_read(E) + +@str_write_read_r depends on !patch exists@ +expression E; +position P; +@@ +* ((E@P) ? "write" : "read") + +@script:python depends on report@ +p << str_write_read_r.P; +e << str_write_read_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_write_read(%s)" % e) From 9b5b4810559d3716aef9fdc8d555e4db1a031749 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 11 Sep 2024 09:09:25 +0800 Subject: [PATCH 347/352] coccinelle: Add rules to find str_on_off() replacements As other rules done, we add rules for str_on_off() to check the relative opportunities. Signed-off-by: Hongbo Li Signed-off-by: Julia Lawall --- scripts/coccinelle/api/string_choices.cocci | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci index 64f04da2e9f4d..34ff8f48965ce 100644 --- a/scripts/coccinelle/api/string_choices.cocci +++ b/scripts/coccinelle/api/string_choices.cocci @@ -279,3 +279,22 @@ e << str_write_read_r.E; @@ coccilib.report.print_report(p[0], "opportunity for str_write_read(%s)" % e) + +@str_on_off depends on patch@ +expression E; +@@ +- ((E) ? "on" : "off") ++ str_on_off(E) + +@str_on_off_r depends on !patch exists@ +expression E; +position P; +@@ +* ((E@P) ? "on" : "off") + +@script:python depends on report@ +p << str_on_off_r.P; +e << str_on_off_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_on_off(%s)" % e) From 253244cdf16a755039f9078b0a785176712f2584 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 11 Sep 2024 09:09:26 +0800 Subject: [PATCH 348/352] coccinelle: Add rules to find str_yes_no() replacements As other rules done, we add rules for str_yes_no() to check the relative opportunities. Signed-off-by: Hongbo Li Signed-off-by: Julia Lawall --- scripts/coccinelle/api/string_choices.cocci | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci index 34ff8f48965ce..96dc7090395d7 100644 --- a/scripts/coccinelle/api/string_choices.cocci +++ b/scripts/coccinelle/api/string_choices.cocci @@ -298,3 +298,22 @@ e << str_on_off_r.E; @@ coccilib.report.print_report(p[0], "opportunity for str_on_off(%s)" % e) + +@str_yes_no depends on patch@ +expression E; +@@ +- ((E) ? "yes" : "no") ++ str_yes_no(E) + +@str_yes_no_r depends on !patch exists@ +expression E; +position P; +@@ +* ((E@P) ? "yes" : "no") + +@script:python depends on report@ +p << str_yes_no_r.P; +e << str_yes_no_r.E; +@@ + +coccilib.report.print_report(p[0], "opportunity for str_yes_no(%s)" % e) From f584e3752ca7bb1f8849a85816b3c974f1aa67ec Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 11 Sep 2024 09:09:27 +0800 Subject: [PATCH 349/352] coccinelle: Remove unnecessary parentheses for only one possible change. 
The parentheses are only needed if there is a disjunction, ie a set of possible changes. If there is only one pattern, we can remove these parentheses. Just like the format: - x + y not: ( - x + y ) Signed-off-by: Hongbo Li Signed-off-by: Julia Lawall --- scripts/coccinelle/api/string_choices.cocci | 8 -------- 1 file changed, 8 deletions(-) diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci index 96dc7090395d7..95e9a3b31f86c 100644 --- a/scripts/coccinelle/api/string_choices.cocci +++ b/scripts/coccinelle/api/string_choices.cocci @@ -43,18 +43,14 @@ coccilib.report.print_report(p[0], "opportunity for str_plural(%s)" % e) @str_up_down depends on patch@ expression E; @@ -( - ((E) ? "up" : "down") + str_up_down(E) -) @str_up_down_r depends on !patch exists@ expression E; position P; @@ -( * ((E@P) ? "up" : "down") -) @script:python depends on report@ p << str_up_down_r.P; @@ -66,18 +62,14 @@ coccilib.report.print_report(p[0], "opportunity for str_up_down(%s)" % e) @str_down_up depends on patch@ expression E; @@ -( - ((E) ? "down" : "up") + str_down_up(E) -) @str_down_up_r depends on !patch exists@ expression E; position P; @@ -( * ((E@P) ? "down" : "up") -) @script:python depends on report@ p << str_down_up_r.P; From 4003ba664bd16f5a969cc883295a9eb5a5aef19e Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 28 Sep 2024 21:26:22 +0200 Subject: [PATCH 350/352] Reduce Coccinelle choices in string_choices.cocci The isomorphism neg_if_exp negates the test of a ?: conditional, making it unnecessary to have an explicit case for a negated test with the branches inverted. At the same time, we can disable neg_if_exp in cases where a different API function may be more suitable for a negated test. Finally, in the non-patch cases, E matches an expression with parentheses around it, so there is no need to mention () explicitly in the pattern. The () are still needed in the patch cases, because we want to drop them, if they are present. Signed-off-by: Julia Lawall --- scripts/coccinelle/api/string_choices.cocci | 91 ++++++++++----------- 1 file changed, 41 insertions(+), 50 deletions(-) diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci index 95e9a3b31f86c..3750450869125 100644 --- a/scripts/coccinelle/api/string_choices.cocci +++ b/scripts/coccinelle/api/string_choices.cocci @@ -14,23 +14,18 @@ expression E; - ((E == 1) ? "" : "s") + str_plural(E) | -- ((E != 1) ? "s" : "") -+ str_plural(E) -| - ((E > 1) ? "s" : "") + str_plural(E) ) -@str_plural_r depends on !patch exists@ +@str_plural_r depends on !patch@ expression E; position P; @@ ( -* ((E@P == 1) ? "" : "s") +* (E@P == 1) ? "" : "s" | -* ((E@P != 1) ? "s" : "") -| -* ((E@P > 1) ? "s" : "") +* (E@P > 1) ? "s" : "" ) @script:python depends on report@ @@ -40,17 +35,17 @@ e << str_plural_r.E; coccilib.report.print_report(p[0], "opportunity for str_plural(%s)" % e) -@str_up_down depends on patch@ +@str_up_down depends on patch disable neg_if_exp@ expression E; @@ - ((E) ? "up" : "down") + str_up_down(E) -@str_up_down_r depends on !patch exists@ +@str_up_down_r depends on !patch disable neg_if_exp@ expression E; position P; @@ -* ((E@P) ? "up" : "down") +* E@P ? 
"up" : "down" @script:python depends on report@ p << str_up_down_r.P; @@ -59,17 +54,17 @@ e << str_up_down_r.E; coccilib.report.print_report(p[0], "opportunity for str_up_down(%s)" % e) -@str_down_up depends on patch@ +@str_down_up depends on patch disable neg_if_exp@ expression E; @@ - ((E) ? "down" : "up") + str_down_up(E) -@str_down_up_r depends on !patch exists@ +@str_down_up_r depends on !patch disable neg_if_exp@ expression E; position P; @@ -* ((E@P) ? "down" : "up") +* E@P ? "down" : "up" @script:python depends on report@ p << str_down_up_r.P; @@ -78,17 +73,17 @@ e << str_down_up_r.E; coccilib.report.print_report(p[0], "opportunity for str_down_up(%s)" % e) -@str_true_false depends on patch@ +@str_true_false depends on patch disable neg_if_exp@ expression E; @@ - ((E) ? "true" : "false") + str_true_false(E) -@str_true_false_r depends on !patch exists@ +@str_true_false_r depends on !patch disable neg_if_exp@ expression E; position P; @@ -* ((E@P) ? "true" : "false") +* E@P ? "true" : "false" @script:python depends on report@ p << str_true_false_r.P; @@ -97,17 +92,17 @@ e << str_true_false_r.E; coccilib.report.print_report(p[0], "opportunity for str_true_false(%s)" % e) -@str_false_true depends on patch@ +@str_false_true depends on patch disable neg_if_exp@ expression E; @@ - ((E) ? "false" : "true") + str_false_true(E) -@str_false_true_r depends on !patch exists@ +@str_false_true_r depends on !patch disable neg_if_exp@ expression E; position P; @@ -* ((E@P) ? "false" : "true") +* E@P ? "false" : "true" @script:python depends on report@ p << str_false_true_r.P; @@ -116,21 +111,17 @@ e << str_false_true_r.E; coccilib.report.print_report(p[0], "opportunity for str_false_true(%s)" % e) -@str_hi_lo depends on patch@ +@str_hi_lo depends on patch disable neg_if_exp@ expression E; @@ -( - ((E) ? "hi" : "lo") + str_hi_lo(E) -) -@str_hi_lo_r depends on !patch exists@ +@str_hi_lo_r depends on !patch disable neg_if_exp@ expression E; position P; @@ -( -* ((E@P) ? "hi" : "lo") -) +* E@P ? "hi" : "lo" @script:python depends on report@ p << str_hi_lo_r.P; @@ -139,17 +130,17 @@ e << str_hi_lo_r.E; coccilib.report.print_report(p[0], "opportunity for str_hi_lo(%s)" % e) -@str_high_low depends on patch@ +@str_high_low depends on patch disable neg_if_exp@ expression E; @@ - ((E) ? "high" : "low") + str_high_low(E) -@str_high_low_r depends on !patch exists@ +@str_high_low_r depends on !patch disable neg_if_exp@ expression E; position P; @@ -* ((E@P) ? "high" : "low") +* E@P ? "high" : "low" @script:python depends on report@ p << str_high_low_r.P; @@ -158,17 +149,17 @@ e << str_high_low_r.E; coccilib.report.print_report(p[0], "opportunity for str_high_low(%s)" % e) -@str_lo_hi depends on patch@ +@str_lo_hi depends on patch disable neg_if_exp@ expression E; @@ - ((E) ? "lo" : "hi") + str_lo_hi(E) -@str_lo_hi_r depends on !patch exists@ +@str_lo_hi_r depends on !patch disable neg_if_exp@ expression E; position P; @@ -* ((E@P) ? "lo" : "hi") +* E@P ? "lo" : "hi" @script:python depends on report@ p << str_lo_hi_r.P; @@ -177,17 +168,17 @@ e << str_lo_hi_r.E; coccilib.report.print_report(p[0], "opportunity for str_lo_hi(%s)" % e) -@str_low_high depends on patch@ +@str_low_high depends on patch disable neg_if_exp@ expression E; @@ - ((E) ? "low" : "high") + str_low_high(E) -@str_low_high_r depends on !patch exists@ +@str_low_high_r depends on !patch disable neg_if_exp@ expression E; position P; @@ -* ((E@P) ? "low" : "high") +* E@P ? 
"low" : "high" @script:python depends on report@ p << str_low_high_r.P; @@ -202,11 +193,11 @@ expression E; - ((E) ? "enable" : "disable") + str_enable_disable(E) -@str_enable_disable_r depends on !patch exists@ +@str_enable_disable_r depends on !patch@ expression E; position P; @@ -* ((E@P) ? "enable" : "disable") +* E@P ? "enable" : "disable" @script:python depends on report@ p << str_enable_disable_r.P; @@ -221,11 +212,11 @@ expression E; - ((E) ? "enabled" : "disabled") + str_enabled_disabled(E) -@str_enabled_disabled_r depends on !patch exists@ +@str_enabled_disabled_r depends on !patch@ expression E; position P; @@ -* ((E@P) ? "enabled" : "disabled") +* E@P ? "enabled" : "disabled" @script:python depends on report@ p << str_enabled_disabled_r.P; @@ -234,17 +225,17 @@ e << str_enabled_disabled_r.E; coccilib.report.print_report(p[0], "opportunity for str_enabled_disabled(%s)" % e) -@str_read_write depends on patch@ +@str_read_write depends on patch disable neg_if_exp@ expression E; @@ - ((E) ? "read" : "write") + str_read_write(E) -@str_read_write_r depends on !patch exists@ +@str_read_write_r depends on !patch disable neg_if_exp@ expression E; position P; @@ -* ((E@P) ? "read" : "write") +* E@P ? "read" : "write" @script:python depends on report@ p << str_read_write_r.P; @@ -253,17 +244,17 @@ e << str_read_write_r.E; coccilib.report.print_report(p[0], "opportunity for str_read_write(%s)" % e) -@str_write_read depends on patch@ +@str_write_read depends on patch disable neg_if_exp@ expression E; @@ - ((E) ? "write" : "read") + str_write_read(E) -@str_write_read_r depends on !patch exists@ +@str_write_read_r depends on !patch disable neg_if_exp@ expression E; position P; @@ -* ((E@P) ? "write" : "read") +* E@P ? "write" : "read" @script:python depends on report@ p << str_write_read_r.P; @@ -278,11 +269,11 @@ expression E; - ((E) ? "on" : "off") + str_on_off(E) -@str_on_off_r depends on !patch exists@ +@str_on_off_r depends on !patch@ expression E; position P; @@ -* ((E@P) ? "on" : "off") +* E@P ? "on" : "off" @script:python depends on report@ p << str_on_off_r.P; @@ -297,11 +288,11 @@ expression E; - ((E) ? "yes" : "no") + str_yes_no(E) -@str_yes_no_r depends on !patch exists@ +@str_yes_no_r depends on !patch@ expression E; position P; @@ -* ((E@P) ? "yes" : "no") +* E@P ? "yes" : "no" @script:python depends on report@ p << str_yes_no_r.P; From 3f749befb0998472470d850b11b430477c0718cc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Sep 2024 14:47:33 -0700 Subject: [PATCH 351/352] x86: kvm: fix build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpu_emergency_register_virt_callback() function is used unconditionally by the x86 kvm code, but it is declared (and defined) conditionally: #if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); ... 
leading to a build error when neither KVM_INTEL nor KVM_AMD support is enabled: arch/x86/kvm/x86.c: In function ‘kvm_arch_enable_virtualization’: arch/x86/kvm/x86.c:12517:9: error: implicit declaration of function ‘cpu_emergency_register_virt_callback’ [-Wimplicit-function-declaration] 12517 | cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/x86/kvm/x86.c: In function ‘kvm_arch_disable_virtualization’: arch/x86/kvm/x86.c:12522:9: error: implicit declaration of function ‘cpu_emergency_unregister_virt_callback’ [-Wimplicit-function-declaration] 12522 | cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix the build by defining empty helper functions the same way the old cpu_emergency_disable_virtualization() function was dealt with for the same situation. Maybe we could instead have made the call sites conditional, since the callers (kvm_arch_{en,dis}able_virtualization()) have an empty weak fallback. I'll leave that to the kvm people to argue about, this at least gets the build going for that particular config. Fixes: 590b09b1d88e ("KVM: x86: Register "emergency disable" callbacks when virt is enabled") Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Kai Huang Cc: Chao Gao Cc: Farrah Chen Signed-off-by: Linus Torvalds --- arch/x86/include/asm/reboot.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index d0ef2a678d66c..c02183d3cdd7e 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -31,6 +31,8 @@ void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); #else +static inline void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) {} +static inline void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) {} static inline void cpu_emergency_disable_virtualization(void) {} #endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ From 9852d85ec9d492ebef56dc5f229416c925758edc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Sep 2024 15:06:19 -0700 Subject: [PATCH 352/352] Linux 6.12-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 265dd990a9b65..187a4ce2728e9 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 11 +PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Baby Opossum Posse # *DOCUMENTATION*