From 70070d6dfe4f2e87af8a09c13e160f3d9c5af3f0 Mon Sep 17 00:00:00 2001
From: Flavio Ceolin
Date: Sun, 19 Nov 2023 15:44:56 -0800
Subject: [PATCH] arch/xtensa: Add new MMU layer

Andy Ross's re-implementation of the MMU layer, with some subtle
changes such as re-using existing macros and fixing the page table
cache property when direct-mapping it in the TLB.

From Andy's original commit message:

This is a reworked MMU layer, sitting cleanly below the page table
handling in the OS.

Notable differences from the original work:

+ Significantly smaller code and simpler API (just three functions to
be called from the OS/userspace/ptable layer).

+ Big README-MMU document containing my learnings over the process,
so hopefully fewer people need to go through this in the future.

+ No TLB flushing needed. Clean separation of ASIDs, just requires
that the upper levels match the ASID to the L1 page table page
consistently.

+ Vector mapping is done with a 4k page and not a 4M page, leading to
much more flexibility with hardware memory layout. The original
scheme required that the 4M region containing vecbase be mapped
virtually to a location other than the hardware address, which makes
confusing linkage with call0 and difficult initialization constraints
where the exception vectors run at different addresses before and
after MMU setup (effectively forcing them to be PIC code).

+ More provably correct initialization, all MMU changes happen in a
single asm block with no memory accesses which would generate a
refill.

Signed-off-by: Andy Ross
Signed-off-by: Flavio Ceolin
---
 arch/xtensa/core/CMakeLists.txt | 2 +-
 arch/xtensa/core/README-MMU.txt | 268 +++++++++++++++++++++
 arch/xtensa/core/include/xtensa_mmu_priv.h | 15 +-
 arch/xtensa/core/mmu.c | 178 ++++++++++++++
 arch/xtensa/core/ptables.c | 211 ++--------------
 arch/xtensa/include/xtensa-asm2-s.h | 25 --
 soc/xtensa/dc233c/mmu.c | 30 ---
 7 files changed, 478 insertions(+), 251 deletions(-)
 create mode 100644 arch/xtensa/core/README-MMU.txt
 create mode 100644 arch/xtensa/core/mmu.c

diff --git a/arch/xtensa/core/CMakeLists.txt b/arch/xtensa/core/CMakeLists.txt
index 1e4b045085ec5b4..f3122c1a5504da3 100644
--- a/arch/xtensa/core/CMakeLists.txt
+++ b/arch/xtensa/core/CMakeLists.txt
@@ -21,7 +21,7 @@ zephyr_library_sources_ifdef(CONFIG_XTENSA_ENABLE_BACKTRACE debug_helpers_asm.S)
 zephyr_library_sources_ifdef(CONFIG_DEBUG_COREDUMP coredump.c)
 zephyr_library_sources_ifdef(CONFIG_TIMING_FUNCTIONS timing.c)
 zephyr_library_sources_ifdef(CONFIG_GDBSTUB gdbstub.c)
-zephyr_library_sources_ifdef(CONFIG_XTENSA_MMU ptables.c)
+zephyr_library_sources_ifdef(CONFIG_XTENSA_MMU ptables.c mmu.c)
 zephyr_library_sources_ifdef(CONFIG_USERSPACE userspace.S)
 zephyr_library_sources_ifdef(CONFIG_XTENSA_SYSCALL_USE_HELPER syscall_helper.c)
diff --git a/arch/xtensa/core/README-MMU.txt b/arch/xtensa/core/README-MMU.txt
new file mode 100644
index 000000000000000..499a251cdf2f2f4
--- /dev/null
+++ b/arch/xtensa/core/README-MMU.txt
@@ -0,0 +1,268 @@
+# Xtensa MMU Operation

As with other elements of the architecture, paged virtual memory
management on Xtensa is somewhat unique. And there is similarly a
lack of introductory material available. This document is an attempt
to introduce the architecture at an overview/tutorial level, and to
describe Zephyr's specific implementation choices.

## General TLB Operation

The Xtensa MMU operates on top of a fairly conventional TLB cache.
The TLB stores virtual to physical translations for individual pages of
memory.
It is partitioned into an automatically managed
4-way-set-associative bank of entries mapping 4k pages, and 3-6
"special" ways storing mappings under OS control. Some of these are
for mapping pages larger than 4k, which Zephyr does not directly
support. A few are for bootstrap and initialization, and will be
discussed below.

Like the L1 cache, the TLB is split into separate instruction and data
entries. Zephyr manages both as needed, but symmetrically. The
architecture technically supports separately-virtualized instruction
and data spaces, but the hardware page table refill mechanism (see
below) does not, and Zephyr's memory spaces are unified regardless.

The TLB may be loaded with permissions and attributes controlling
cacheability, access control based on ring (i.e. the contents of the
RING field of the PS register) and togglable write and execute access.
Memory access, even with a matching TLB entry, may therefore create
Kernel/User exceptions as desired to enforce permissions choices on
userspace code.

Live TLB entries are tagged with an 8-bit "ASID" value derived from
the ring field of the PTE that loaded them, via a simple translation
specified in the RASID special register. The intent is that each
non-kernel address space will get a separate ring 3 ASID set in RASID,
such that you can switch between them without a TLB flush. The ASID
value of ring zero is fixed at 1; it may not be changed. (An ASID
value of zero is used to tag an invalid/unmapped TLB entry at
initialization, but this mechanism isn't accessible to OS code except
in special circumstances, and in any case there is already an invalid
attribute value that can be used in a PTE).

## Virtually-mapped Page Tables

Xtensa has a unique (and, to someone exposed for the first time,
extremely confusing) "page table" format. The simplest way to begin
to explain this is just to describe the (quite simple) hardware
behavior:

On a TLB miss, the hardware immediately does a single fetch (at ring 0
privilege) from RAM by adding the "desired address right shifted by
10 bits with the bottom two bits set to zero" (i.e. the page frame
number in units of 4 bytes) to the value in the PTEVADDR special
register. If this load succeeds, then the word is treated as a PTE
with which to fill the TLB and use for a (restarted) memory access.
This is extremely simple (just one extra hardware state that does just
one thing the hardware can already do), and quite fast (only one
memory fetch vs. e.g. the 2-5 fetches required to walk a page table on
x86).

This special "refill" fetch is otherwise identical to any other memory
access, meaning it too uses the TLB to translate from a virtual to
physical address. Which means that the page tables occupy a 4M region
of virtual, not physical, address space, in the same memory space
occupied by the running code. The 1024 pages in that range (not all
of which might be mapped in physical memory) are a linear array of
1048576 4-byte PTE entries, each describing a mapping for 4k of
virtual memory. Note especially that exactly one of those pages
contains the 1024 PTE entries for the 4M page table itself, pointed to
by PTEVADDR.

Obviously, the page table memory being virtual means that the fetch
can fail: there are 1024 possible pages in a complete page table
covering all of memory, and the ~16 entry TLB clearly won't contain
entries mapping all of them.
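
For reference, the refill fetch can be written out explicitly. The
sketch below is illustrative only (it is not code from this patch),
but the arithmetic matches the PTE_ENTRY_ADDRESS formula quoted in
xtensa_mmu_priv.h and the new Z_XTENSA_PTE_ENTRY_VADDR() macro:

    #include <stdint.h>

    /* Address from which the hardware loads the PTE for vaddr: the
     * virtual page number (vaddr >> 12) times the 4-byte PTE size,
     * added to the base of the 4M page table region.  This is the
     * same as (vaddr >> 10) with the bottom two bits cleared.
     */
    static inline uint32_t pte_refill_address(uint32_t ptevaddr, uint32_t vaddr)
    {
            return ptevaddr + ((vaddr >> 12) * 4);
    }

If that load itself misses in the TLB, the hardware needs help from
the OS, as described next.
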
If we are missing a TLB entry for the
page translation we want (NOT for the original requested address, we
already know we're missing that TLB entry), the hardware has exactly
one more special trick: it throws a TLB Miss exception (there are two,
one each for instruction/data TLBs, but in Zephyr they operate
identically).

The job of that exception handler is simply to ensure that the TLB has
an entry for the page table page we want. And the simplest way to do
that is to just load the faulting PTE as an address, which will then
go through the same refill process above. This second TLB fetch in
the exception handler may result in an invalid/inapplicable mapping
within the 4M page table region. This is a typical/expected runtime
fault, and simply indicates unmapped memory. The result is a TLB miss
exception from within the TLB miss exception handler (i.e. while the
EXCM bit is set). This will produce a Double Exception fault, which
is handled by the OS identically to a general Kernel/User data access
prohibited exception.

After the TLB refill exception, the original faulting instruction is
restarted, which retries the refill process, which succeeds in
fetching a new TLB entry, which is then used to service the original
memory access. (And may then result in yet another exception if it
turns out that the TLB entry doesn't permit the access requested, of
course.)

## Special Cases

The page-tables-specified-in-virtual-memory trick works very well in
practice. But it does have a chicken/egg problem with the initial
state. Because everything depends on state in the TLB, something
needs to tell the hardware how to find a physical address using the
TLB to begin the process. Here we exploit the separate
non-automatically-refilled TLB ways to store bootstrap records.

First, note that the refill process to load a PTE requires that the 4M
space of PTE entries be resolvable by the TLB directly, without
requiring another refill. This 4M mapping is provided by a single
page of PTE entries (which itself lives in the 4M page table region!).
This page must always be in the TLB.

Thankfully, for the data TLB Xtensa provides 3 special/non-refillable
ways (ways 7-9) with at least one 4k page mapping each. We can use
one of these to "pin" the top-level page table entry in place,
ensuring that a refill access will be able to find a PTE address.

But now note that the load from that PTE address for the refill is
done in an exception handler. And running an exception handler
requires doing a fetch via the instruction TLB. And that obviously
means that the page(s) containing the exception handler must never
require a refill exception of their own.

Ideally we would just pin the vector/handler page in the ITLB in the
same way we do for data, but somewhat inexplicably, Xtensa does not
provide 4k "pinnable" ways in the instruction TLB (frankly this seems
like a design flaw).

Instead, we load ITLB entries for vector handlers via the refill
mechanism using the data TLB, and so need the refill mechanism for the
vector page to succeed always. The way to do this is to similarly pin
the page table page containing the (single) PTE for the vector page in
the data TLB, such that instruction fetches always find their TLB
mapping via refill, without requiring an exception.

## Initialization

Unlike most other architectures, Xtensa does not have a "disable" mode
for the MMU. Virtual address translation through the TLB is active at
all times.
There therefore needs to be a mechanism for the CPU to
execute code before the OS is able to initialize a refillable page
table.

The way Xtensa resolves this (on the hardware Zephyr supports, see the
note below) is to have an 8-entry set ("way 6") of 512M pages able to
cover all of memory. These 8 entries are initialized as valid, with
attributes specifying that they are accessible only to an ASID of 1
(i.e. the fixed ring zero / kernel ASID), writable, executable, and
uncached. So at boot the CPU relies on these TLB entries to provide a
clean view of hardware memory.

But that means that enabling page-level translation requires some
care, as the CPU will throw an exception ("multi hit") if a memory
access matches more than one live entry in the TLB. The
initialization algorithm is therefore:

0. Start with a fully-initialized page table layout, including the
   top-level "L1" page containing the mappings for the page table
   itself.

1. Ensure that the initialization routine does not cross a page
   boundary (to prevent stray TLB refill exceptions), that it occupies
   a separate 4k page from the exception vectors (which we must
   temporarily double-map), and that it operates entirely in registers
   (to avoid doing memory access at inopportune moments).

2. Pin the L1 page table PTE into the data TLB. This creates a double
   mapping condition, but it is safe as nothing will use it until we
   start refilling.

3. Pin the page table page containing the PTE for the TLB miss
   exception handler into the data TLB. This will likewise not be
   accessed until the double map condition is resolved.

4. Set PTEVADDR appropriately. The CPU state to handle refill
   exceptions is now complete, but cannot be used until we resolve the
   double mappings.

5. Disable the initial/way6 data TLB entries first, by setting them to
   an ASID of zero. This is safe as the code being executed is not
   doing data accesses yet (including refills), and will resolve the
   double mapping conditions we created above.

6. Disable the initial/way6 instruction TLBs second. The very next
   instruction following the invalidation of the currently-executing
   code page will then cause a TLB refill exception, which will work
   normally because we just resolved the final double-map condition.
   (Pedantic note: if the vector page and the currently-executing page
   are in different 512M way6 pages, disable the mapping for the
   exception handlers first so the trap from our current code can be
   handled. Currently Zephyr doesn't handle this condition as in all
   reasonable hardware these regions will be near each other.)

Note: there is a different variant of the Xtensa MMU architecture
where the way 5/6 pages are immutable, and specify a set of
unchangeable mappings from the final 384M of memory to the bottom and
top of physical memory. The intent here would (presumably) be that
these would be used by the kernel for all physical memory and that the
remaining memory space would be used for virtual mappings. This
doesn't match Zephyr's architecture well, as we tend to assume
page-level control over physical memory (e.g. .text/.rodata is cached
but .data is not on SMP, etc...). And in any case we don't have any
such hardware to experiment with. But with a little address
translation we could support this.

## ASID vs. Virtual Mapping

The ASID mechanism in Xtensa works like other architectures, and is
intended to be used similarly.
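
For concreteness, RASID packs one 8-bit ASID per ring, with ring 0 in
the low byte. The new compute_regs() in mmu.c (later in this patch)
builds the register as shown below; which ring carries which value
follows the macro names used there:

    /* Ring 0 (kernel) is fixed at ASID 1, ring 1 is unused and set
     * to 2, ring 2 carries the per-domain user ASID, and ring 3
     * carries the ASID used for shared mappings.
     */
    regs->rasid = (Z_XTENSA_MMU_SHARED_ASID << 24) |
                  (user_asid << 16) | 0x000201;
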
The intent of the design is that at
context switch time, you can simply change RASID and the page table
data, and leave any existing mappings in place in the TLB using the
old ASID value(s). So in the common case where you switch back,
nothing needs to be flushed.

Unfortunately this runs afoul of the virtual mapping of the page
refill: data TLB entries storing the 4M page table mapping space are
stored at ASID 1 (ring 0); they can't change when the page tables
change! So this region naively would have to be flushed, which is
tantamount to flushing the entire TLB regardless (the TLB is much
smaller than the 1024-page PTE array).

The resolution in Zephyr is to give each ASID its own PTEVADDR mapping
in virtual space, such that the page tables don't overlap. This is
expensive in virtual address space: assigning 4M of space to each of
the 256 ASIDs (actually 254 as 0 and 1 are never used by user access)
would take a full gigabyte of address space. Zephyr optimizes this a
bit by deriving a unique sequential ASID from the hardware address of
the statically allocated array of L1 page table pages.

Note, obviously, that any change of the mappings within an ASID
(e.g. to re-use it for another memory domain, or just for any runtime
mapping change other than mapping previously-unmapped pages) still
requires a TLB flush, and always will.

## SMP/Cache Interaction

A final important note is that the hardware PTE refill fetch works
like any other CPU memory access, and in particular it is governed by
the cacheability attributes of the TLB entry through which it was
loaded. This means that if the page table entries are marked
cacheable, then the hardware TLB refill process will be downstream of
the L1 data cache on the CPU. If the physical memory storing page
tables has been accessed recently by the CPU (for a refill of another
page mapped within the same cache line, or to change the tables) then
the refill will be served from the data cache and not main memory.

This may or may not be desirable depending on access patterns. It
lets the L1 data cache act as an "L2 TLB" for applications with a lot
of access variability. But it also means that the TLB entries end up
being stored twice in the same CPU, wasting transistors that could
presumably store other useful data.

But it is also important to note that the L1 data cache on Xtensa is
incoherent! The cache being used for refill reflects the last access
on the current CPU only, and not that of the underlying memory being
mapped. Page table changes in the data cache of one CPU will be
invisible to the data cache of another. There is no simple way of
notifying another CPU of changes to page mappings beyond doing
system-wide flushes on all CPUs every time a memory domain is
modified.

The result is that, when SMP is enabled, Zephyr must ensure that all
page table mappings in the system are set uncached. The OS makes no
attempt to bolt on a software coherence layer.
diff --git a/arch/xtensa/core/include/xtensa_mmu_priv.h b/arch/xtensa/core/include/xtensa_mmu_priv.h
index cf72c92138373cb..7b1030786f424d3 100644
--- a/arch/xtensa/core/include/xtensa_mmu_priv.h
+++ b/arch/xtensa/core/include/xtensa_mmu_priv.h
@@ -132,15 +132,8 @@
 *
 * PTE_ENTRY_ADDRESS = PTEVADDR + ((VADDR / 4096) * 4)
 */
-#define Z_XTENSA_PTE_ENTRY_VADDR(vaddr) \
-	(Z_XTENSA_PTEVADDR + (((vaddr) / KB(4)) * 4))
-
-/*
- * The address of the top level page where the page
- * is located in the virtual address.
- */
-#define Z_XTENSA_PAGE_TABLE_VADDR \
-	Z_XTENSA_PTE_ENTRY_VADDR(Z_XTENSA_PTEVADDR)
+#define Z_XTENSA_PTE_ENTRY_VADDR(base, vaddr) \
+	((base) + (((vaddr) / KB(4)) * 4))

 /*
 * Get asid for a given ring from rasid register.
@@ -349,4 +342,8 @@ static inline void xtensa_dtlb_vaddr_invalidate(void *vaddr)
 	}
 }

+void xtensa_init_paging(uint32_t *l1_page);
+
+void xtensa_set_paging(uint32_t asid, uint32_t *l1_page);
+
 #endif /* ZEPHYR_ARCH_XTENSA_XTENSA_MMU_PRIV_H_ */
diff --git a/arch/xtensa/core/mmu.c b/arch/xtensa/core/mmu.c
new file mode 100644
index 000000000000000..24e47a42b9ded3b
--- /dev/null
+++ b/arch/xtensa/core/mmu.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2023 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <zephyr/kernel.h>
+#include <zephyr/cache.h>
+#include <zephyr/arch/xtensa/arch_inlines.h>
+#include <zephyr/arch/xtensa/xtensa_mmu.h>
+#include <xtensa_mmu_priv.h>
+
+#define ASID_INVALID 0
+
+struct tlb_regs {
+	uint32_t rasid;
+	uint32_t ptevaddr;
+	uint32_t ptepin_as;
+	uint32_t ptepin_at;
+	uint32_t vecpin_as;
+	uint32_t vecpin_at;
+};
+
+static void compute_regs(uint32_t user_asid, uint32_t *l1_page, struct tlb_regs *regs)
+{
+	uint32_t vecbase = XTENSA_RSR("VECBASE");
+
+	__ASSERT_NO_MSG((((uint32_t)l1_page) & 0xfff) == 0);
+	__ASSERT_NO_MSG((user_asid == 0) || ((user_asid > 2) &&
+				(user_asid < Z_XTENSA_MMU_SHARED_ASID)));
+
+	/* We don't use ring 1; the ring 0 ASID must be 1 */
+	regs->rasid = (Z_XTENSA_MMU_SHARED_ASID << 24) |
+		      (user_asid << 16) | 0x000201;
+
+	/* Derive PTEVADDR from ASID so each domain gets its own PTE area */
+	regs->ptevaddr = CONFIG_XTENSA_MMU_PTEVADDR + user_asid * 0x400000;
+
+	/* The ptables code doesn't add the mapping for the l1 page itself */
+	l1_page[Z_XTENSA_L1_POS(regs->ptevaddr)] =
+		(uint32_t)l1_page | Z_XTENSA_PAGE_TABLE_ATTR;
+
+	regs->ptepin_at = (uint32_t)l1_page;
+	regs->ptepin_as = Z_XTENSA_PTE_ENTRY_VADDR(regs->ptevaddr, regs->ptevaddr)
+			  | Z_XTENSA_MMU_PTE_WAY;
+
+	/* Pin mapping for refilling the vector address into the ITLB
+	 * (for handling TLB miss exceptions). Note: this is NOT an
+	 * instruction TLB entry for the vector code itself, it's a
+	 * DATA TLB entry for the page containing the vector mapping
+	 * so the refill on instruction fetch can find it. The
+	 * hardware doesn't have a 4k pinnable instruction TLB way,
+	 * frustratingly.
+	 */
+	uint32_t vb_pte = l1_page[Z_XTENSA_L1_POS(vecbase)];
+
+	regs->vecpin_at = vb_pte;
+	regs->vecpin_as = Z_XTENSA_PTE_ENTRY_VADDR(regs->ptevaddr, vecbase)
+			  | Z_XTENSA_MMU_VECBASE_WAY;
+}
+
+/* Switch to a new page table. There are four items we have to set in
+ * the hardware: the PTE virtual address, the ring/ASID mapping
+ * register, and two pinned entries in the data TLB handling refills
+ * for the page tables and the vector handlers.
+ *
+ * These can be done in any order, provided that we ensure that no
+ * memory access which causes a TLB miss can happen during the process.
+ * This means that we must work entirely within registers in a single
+ * asm block. Also note that instruction fetches are memory accesses
+ * too, which means we cannot cross a page boundary which might reach
+ * a new page not in the TLB (a single jump to an aligned address that
+ * holds our five instructions is sufficient to guarantee that: I
+ * couldn't think of a way to do the alignment statically that also
+ * interoperated well with inline assembly).
+ */
+void xtensa_set_paging(uint32_t user_asid, uint32_t *l1_page)
+{
+	/* Optimization note: the registers computed here are pure
+	 * functions of the two arguments. With a minor API tweak,
+	 * they could be cached in e.g.
a thread struct instead of
+	 * being recomputed. This is called on context switch paths
+	 * and is performance-sensitive.
+	 */
+	struct tlb_regs regs;
+
+	compute_regs(user_asid, l1_page, &regs);
+
+	__asm__ volatile("j 1f\n"
+			 ".align 16\n" /* enough for 5 insns */
+			 "1:\n"
+			 "wsr %0, PTEVADDR\n"
+			 "wsr %1, RASID\n"
+			 "wdtlb %2, %3\n"
+			 "wdtlb %4, %5\n"
+			 "isync"
+			 :: "r"(regs.ptevaddr), "r"(regs.rasid),
+			    "r"(regs.ptepin_at), "r"(regs.ptepin_as),
+			    "r"(regs.vecpin_at), "r"(regs.vecpin_as));
+}
+
+/* This is effectively the same algorithm as xtensa_set_paging(),
+ * but it also disables the hardware-initialized 512M TLB entries in
+ * way 6 (because the hardware disallows duplicate TLB mappings). For
+ * instruction fetches this produces a critical ordering constraint:
+ * the instruction following the invalidation of the ITLB entry mapping
+ * the current PC will by definition create a refill condition, which
+ * will (because the data TLB was invalidated) cause a refill
+ * exception. Therefore this step must be the very last one, once
+ * everything else is set up and working, which includes the
+ * invalidation of the virtual PTEVADDR area so that the resulting
+ * refill can complete.
+ *
+ * Note that we can't guarantee that the compiler won't insert a data
+ * fetch from our stack memory after exit from the asm block (while it
+ * might be double-mapped), so we invalidate that data TLB inside the
+ * asm for correctness. The other 13 entries get invalidated in a C
+ * loop at the end.
+ */
+void xtensa_init_paging(uint32_t *l1_page)
+{
+	extern char z_xt_init_pc; /* defined in asm below */
+	struct tlb_regs regs;
+
+#if CONFIG_MP_MAX_NUM_CPUS > 1
+	/* The incoherent cache can get into terrible trouble if it's
+	 * allowed to cache PTEs differently across CPUs. We require
+	 * that all page tables supplied by the OS have exclusively
+	 * uncached mappings for page data, but can't do anything
+	 * about earlier code/firmware. Dump the cache to be safe.
+	 */
+	sys_cache_data_flush_and_invd_all();
+#endif
+
+	compute_regs(ASID_INVALID, l1_page, &regs);
+
+	uint32_t idtlb_pte = (regs.ptevaddr & 0xe0000000) | XCHAL_SPANNING_WAY;
+	uint32_t idtlb_stk = (((uint32_t)&regs) & ~0xfff) | XCHAL_SPANNING_WAY;
+	uint32_t iitlb_pc = (((uint32_t)&z_xt_init_pc) & ~0xfff) | XCHAL_SPANNING_WAY;
+
+	/* Note: the jump is mostly pedantry, as it's almost
+	 * inconceivable that a hardware memory region at boot is
+	 * going to cross a 512M page boundary. But we need the entry
+	 * symbol to get the address above, so the jump is here for
+	 * symmetry with the set_paging() code.
+	 */
+	__asm__ volatile("j z_xt_init_pc\n"
+			 ".align 32\n" /* room for 10 insns */
+			 ".globl z_xt_init_pc\n"
+			 "z_xt_init_pc:\n"
+			 "wsr %0, PTEVADDR\n"
+			 "wsr %1, RASID\n"
+			 "wdtlb %2, %3\n"
+			 "wdtlb %4, %5\n"
+			 "idtlb %6\n" /* invalidate pte */
+			 "idtlb %7\n" /* invalidate stk */
+			 "isync\n"
+			 "iitlb %8\n" /* invalidate pc */
+			 "isync\n" /* <--- traps an ITLB miss */
+			 :: "r"(regs.ptevaddr), "r"(regs.rasid),
+			    "r"(regs.ptepin_at), "r"(regs.ptepin_as),
+			    "r"(regs.vecpin_at), "r"(regs.vecpin_as),
+			    "r"(idtlb_pte), "r"(idtlb_stk), "r"(iitlb_pc));
+
+	/* Invalidate the remaining (unused by this function)
+	 * initialization entries. Now we're flying free with our own
+	 * page table.
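+	 * Each way-6 entry spans 512MB of address space, which is why the
+	 * loop below steps through the candidates at 0x20000000 strides
+	 * and skips the three entries (pte, stk and pc) that were already
+	 * invalidated inside the asm block above.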
+	 */
+	for (int i = 0; i < 8; i++) {
+		uint32_t ixtlb = (i * 0x20000000) | XCHAL_SPANNING_WAY;
+
+		if (ixtlb != iitlb_pc) {
+			__asm__ volatile("iitlb %0" :: "r"(ixtlb));
+		}
+		if (ixtlb != idtlb_stk && ixtlb != idtlb_pte) {
+			__asm__ volatile("idtlb %0" :: "r"(ixtlb));
+		}
+	}
+	__asm__ volatile("isync");
+}
diff --git a/arch/xtensa/core/ptables.c b/arch/xtensa/core/ptables.c
index 14369aa2765f76f..ddbb0fdcfb7d3e1 100644
--- a/arch/xtensa/core/ptables.c
+++ b/arch/xtensa/core/ptables.c
@@ -218,42 +218,6 @@ static inline uint32_t *alloc_l2_table(void)
 	return NULL;
 }

-/**
- * @brief Switch page tables
- *
- * This switches the page tables to the incoming ones (@a ptables).
- * Since data TLBs to L2 page tables are auto-filled, @a dtlb_inv
- * can be used to invalidate these data TLBs. @a cache_inv can be
- * set to true to invalidate cache to the page tables.
- *
- * @param[in] ptables Page tables to be switched to.
- * @param[in] dtlb_inv True if to invalidate auto-fill data TLBs.
- * @param[in] cache_inv True if to invalidate cache to page tables.
- */
-static ALWAYS_INLINE void switch_page_tables(uint32_t *ptables, bool dtlb_inv, bool cache_inv)
-{
-	if (cache_inv) {
-		sys_cache_data_flush_and_invd_all();
-	}
-
-	/* Invalidate data TLB to L1 page table */
-	xtensa_dtlb_vaddr_invalidate((void *)Z_XTENSA_PAGE_TABLE_VADDR);
-
-	/* Now map the pagetable itself with KERNEL asid to avoid user thread
-	 * from tampering with it.
-	 */
-	xtensa_dtlb_entry_write_sync(
-		Z_XTENSA_PTE((uint32_t)ptables, Z_XTENSA_KERNEL_RING, Z_XTENSA_PAGE_TABLE_ATTR),
-		Z_XTENSA_TLB_ENTRY(Z_XTENSA_PAGE_TABLE_VADDR, Z_XTENSA_MMU_PTE_WAY));
-
-	if (dtlb_inv) {
-		/* Since L2 page tables are auto-refilled,
-		 * invalidate all of them to flush the old entries out.
-		 */
-		xtensa_tlb_autorefill_invalidate();
-	}
-}
-
 static void map_memory_range(const uint32_t start, const uint32_t end,
 			     const uint32_t attrs, bool shared)
 {
@@ -345,6 +309,17 @@ static void xtensa_init_page_tables(void)
 #if defined(__GNUC__)
 #pragma GCC diagnostic pop
 #endif
+	/* Finally, the direct-mapped pages used in the page tables
+	 * must be fixed up to use the same cache attribute (but these
+	 * must be writable, obviously). They shouldn't be left at
+	 * the default.
+	 */
+	map_memory_range((uint32_t) &l1_page_table[0],
+			 (uint32_t) &l1_page_table[CONFIG_XTENSA_MMU_NUM_L1_TABLES],
+			 Z_XTENSA_PAGE_TABLE_ATTR | Z_XTENSA_MMU_W, false);
+	map_memory_range((uint32_t) &l2_page_tables[0],
+			 (uint32_t) &l2_page_tables[CONFIG_XTENSA_MMU_NUM_L2_TABLES],
+			 Z_XTENSA_PAGE_TABLE_ATTR | Z_XTENSA_MMU_W, false);

 	sys_cache_data_flush_all();
 }
@@ -356,9 +331,6 @@ __weak void arch_xtensa_mmu_post_init(bool is_core0)

 void z_xtensa_mmu_init(void)
 {
-	volatile uint8_t entry;
-	uint32_t ps, vecbase;
-
 	if (_current_cpu->id == 0) {
 		/* This is normally done via arch_kernel_init() inside z_cstart().
 		 * However, before that is called, we go through the sys_init of
@@ -369,111 +341,7 @@ void z_xtensa_mmu_init(void)
 		xtensa_init_page_tables();
 	}

-	/* Set the page table location in the virtual address */
-	xtensa_ptevaddr_set((void *)Z_XTENSA_PTEVADDR);
-
-	/* Set rasid */
-	xtensa_rasid_asid_set(Z_XTENSA_MMU_SHARED_ASID, Z_XTENSA_SHARED_RING);
-
-	/* Next step is to invalidate the tlb entry that contains the top level
-	 * page table. This way we don't cause a multi hit exception.
- */ - xtensa_dtlb_entry_invalidate_sync(Z_XTENSA_TLB_ENTRY(Z_XTENSA_PAGE_TABLE_VADDR, 6)); - xtensa_itlb_entry_invalidate_sync(Z_XTENSA_TLB_ENTRY(Z_XTENSA_PAGE_TABLE_VADDR, 6)); - - /* We are not using a flat table page, so we need to map - * only the top level page table (which maps the page table itself). - * - * Lets use one of the wired entry, so we never have tlb miss for - * the top level table. - */ - xtensa_dtlb_entry_write(Z_XTENSA_PTE((uint32_t)z_xtensa_kernel_ptables, - Z_XTENSA_KERNEL_RING, Z_XTENSA_PAGE_TABLE_ATTR), - Z_XTENSA_TLB_ENTRY(Z_XTENSA_PAGE_TABLE_VADDR, Z_XTENSA_MMU_PTE_WAY)); - - /* Before invalidate the text region in the TLB entry 6, we need to - * map the exception vector into one of the wired entries to avoid - * a page miss for the exception. - */ - __asm__ volatile("rsr.vecbase %0" : "=r"(vecbase)); - - xtensa_itlb_entry_write_sync( - Z_XTENSA_PTE(vecbase, Z_XTENSA_KERNEL_RING, - Z_XTENSA_MMU_X | Z_XTENSA_MMU_CACHED_WT), - Z_XTENSA_TLB_ENTRY( - Z_XTENSA_PTEVADDR + MB(4), 3)); - - xtensa_dtlb_entry_write_sync( - Z_XTENSA_PTE(vecbase, Z_XTENSA_KERNEL_RING, - Z_XTENSA_MMU_X | Z_XTENSA_MMU_CACHED_WT), - Z_XTENSA_TLB_ENTRY( - Z_XTENSA_PTEVADDR + MB(4), 3)); - - /* Temporarily uses KernelExceptionVector for level 1 interrupts - * handling. This is due to UserExceptionVector needing to jump to - * _Level1Vector. The jump ('j') instruction offset is incorrect - * when we move VECBASE below. - */ - __asm__ volatile("rsr.ps %0" : "=r"(ps)); - ps &= ~PS_UM; - __asm__ volatile("wsr.ps %0; rsync" :: "a"(ps)); - - __asm__ volatile("wsr.vecbase %0; rsync\n\t" - :: "a"(Z_XTENSA_PTEVADDR + MB(4))); - - - /* Finally, lets invalidate all entries in way 6 as the page tables - * should have already mapped the regions we care about for boot. - */ - for (entry = 0; entry < BIT(XCHAL_ITLB_ARF_ENTRIES_LOG2); entry++) { - __asm__ volatile("iitlb %[idx]\n\t" - "isync" - :: [idx] "a"((entry << 29) | 6)); - } - - for (entry = 0; entry < BIT(XCHAL_DTLB_ARF_ENTRIES_LOG2); entry++) { - __asm__ volatile("idtlb %[idx]\n\t" - "dsync" - :: [idx] "a"((entry << 29) | 6)); - } - - /* Map VECBASE to a fixed data TLB */ - xtensa_dtlb_entry_write( - Z_XTENSA_PTE((uint32_t)vecbase, - Z_XTENSA_KERNEL_RING, Z_XTENSA_MMU_CACHED_WB), - Z_XTENSA_TLB_ENTRY((uint32_t)vecbase, Z_XTENSA_MMU_VECBASE_WAY)); - - /* - * Pre-load TLB for vecbase so exception handling won't result - * in TLB miss during boot, and that we can handle single - * TLB misses. - */ - xtensa_itlb_entry_write_sync( - Z_XTENSA_PTE(vecbase, Z_XTENSA_KERNEL_RING, - Z_XTENSA_MMU_X | Z_XTENSA_MMU_CACHED_WT), - Z_XTENSA_AUTOFILL_TLB_ENTRY(vecbase)); - - /* To finish, just restore vecbase and invalidate TLB entries - * used to map the relocated vecbase. - */ - __asm__ volatile("wsr.vecbase %0; rsync\n\t" - :: "a"(vecbase)); - - /* Restore PS_UM so that level 1 interrupt handling will go to - * UserExceptionVector. - */ - __asm__ volatile("rsr.ps %0" : "=r"(ps)); - ps |= PS_UM; - __asm__ volatile("wsr.ps %0; rsync" :: "a"(ps)); - - xtensa_dtlb_entry_invalidate_sync(Z_XTENSA_TLB_ENTRY(Z_XTENSA_PTEVADDR + MB(4), 3)); - xtensa_itlb_entry_invalidate_sync(Z_XTENSA_TLB_ENTRY(Z_XTENSA_PTEVADDR + MB(4), 3)); - - /* - * Clear out THREADPTR as we use it to indicate - * whether we are in user mode or not. 
- */ - XTENSA_WUR("THREADPTR", 0); + xtensa_init_paging(z_xtensa_kernel_ptables); arch_xtensa_mmu_post_init(_current_cpu->id == 0); } @@ -504,7 +372,7 @@ __weak void arch_reserved_pages_update(void) static bool l2_page_table_map(uint32_t *l1_table, void *vaddr, uintptr_t phys, uint32_t flags, bool is_user) { - uint32_t l1_pos = (uint32_t)vaddr >> 22; + uint32_t l1_pos = Z_XTENSA_L1_POS((uint32_t)vaddr); uint32_t l2_pos = Z_XTENSA_L2_POS((uint32_t)vaddr); uint32_t *table; @@ -530,6 +398,7 @@ static bool l2_page_table_map(uint32_t *l1_table, void *vaddr, uintptr_t phys, flags); sys_cache_data_flush_range((void *)&table[l2_pos], sizeof(table[0])); + xtensa_tlb_autorefill_invalidate(); return true; } @@ -604,18 +473,6 @@ static inline void __arch_mem_map(void *va, uintptr_t pa, uint32_t xtensa_flags, k_spin_unlock(&z_mem_domain_lock, key); } #endif /* CONFIG_USERSPACE */ - - if ((xtensa_flags & Z_XTENSA_MMU_X) == Z_XTENSA_MMU_X) { - xtensa_itlb_vaddr_invalidate(vaddr); - } - xtensa_dtlb_vaddr_invalidate(vaddr); - - if (IS_ENABLED(CONFIG_XTENSA_MMU_DOUBLE_MAP)) { - if (xtensa_flags & Z_XTENSA_MMU_X) { - xtensa_itlb_vaddr_invalidate(vaddr_uc); - } - xtensa_dtlb_vaddr_invalidate(vaddr_uc); - } } void arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags) @@ -680,7 +537,7 @@ void arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags) */ static bool l2_page_table_unmap(uint32_t *l1_table, void *vaddr) { - uint32_t l1_pos = (uint32_t)vaddr >> 22; + uint32_t l1_pos = Z_XTENSA_L1_POS((uint32_t)vaddr); uint32_t l2_pos = Z_XTENSA_L2_POS((uint32_t)vaddr); uint32_t *l2_table; uint32_t table_pos; @@ -718,8 +575,7 @@ static bool l2_page_table_unmap(uint32_t *l1_table, void *vaddr) atomic_clear_bit(l2_page_tables_track, table_pos); /* Need to invalidate L2 page table as it is no longer valid. */ - xtensa_dtlb_vaddr_invalidate((void *)l2_table); - + xtensa_tlb_autorefill_invalidate(); end: return exec; } @@ -764,18 +620,6 @@ static inline void __arch_mem_unmap(void *va) } k_spin_unlock(&z_mem_domain_lock, key); #endif /* CONFIG_USERSPACE */ - - if (is_exec) { - xtensa_itlb_vaddr_invalidate(vaddr); - } - xtensa_dtlb_vaddr_invalidate(vaddr); - - if (IS_ENABLED(CONFIG_XTENSA_MMU_DOUBLE_MAP)) { - if (is_exec) { - xtensa_itlb_vaddr_invalidate(vaddr_uc); - } - xtensa_dtlb_vaddr_invalidate(vaddr_uc); - } } void arch_mem_unmap(void *addr, size_t size) @@ -853,7 +697,7 @@ void z_xtensa_mmu_tlb_shootdown(void) * MMU_PTE_WAY, so we can skip the probing step by * generating the query entry directly. */ - ptevaddr_entry = Z_XTENSA_PAGE_TABLE_VADDR | MMU_PTE_WAY; + ptevaddr_entry = (uint32_t)xtensa_ptevaddr_get() | Z_XTENSA_MMU_PTE_WAY; ptevaddr = xtensa_dtlb_paddr_read(ptevaddr_entry); thread_ptables = (uint32_t)thread->arch.ptables; @@ -863,7 +707,9 @@ void z_xtensa_mmu_tlb_shootdown(void) * indicated by the current thread are different * than the current mapped page table. 
*/ - switch_page_tables((uint32_t *)thread_ptables, true, true); + struct arch_mem_domain *domain = + &(thread->mem_domain_info.mem_domain->arch); + xtensa_set_paging(domain->asid, (uint32_t *)thread_ptables); } } @@ -981,9 +827,8 @@ static int region_map_update(uint32_t *ptables, uintptr_t start, for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) { uint32_t *l2_table, pte; uint32_t page = start + offset; - uint32_t l1_pos = page >> 22; + uint32_t l1_pos = Z_XTENSA_L1_POS(page); uint32_t l2_pos = Z_XTENSA_L2_POS(page); - /* Make sure we grab a fresh copy of L1 page table */ sys_cache_data_invd_range((void *)&ptables[l1_pos], sizeof(ptables[0])); @@ -998,8 +843,7 @@ static int region_map_update(uint32_t *ptables, uintptr_t start, sys_cache_data_flush_range((void *)&l2_table[l2_pos], sizeof(l2_table[0])); - xtensa_dtlb_vaddr_invalidate( - (void *)(pte & Z_XTENSA_PTE_PPN_MASK)); + xtensa_dtlb_vaddr_invalidate((void *)page); } return ret; @@ -1127,7 +971,7 @@ int arch_mem_domain_thread_add(struct k_thread *thread) * the current thread running. */ if (thread == _current_cpu->current) { - switch_page_tables(thread->arch.ptables, true, true); + xtensa_set_paging(domain->arch.asid, thread->arch.ptables); } #if CONFIG_MP_MAX_NUM_CPUS > 1 @@ -1179,7 +1023,7 @@ static bool page_validate(uint32_t *ptables, uint32_t page, uint8_t ring, bool w { uint8_t asid_ring; uint32_t rasid, pte, *l2_table; - uint32_t l1_pos = page >> 22; + uint32_t l1_pos = Z_XTENSA_L1_POS(page); uint32_t l2_pos = Z_XTENSA_L2_POS(page); if (is_pte_illegal(ptables[l1_pos])) { @@ -1245,12 +1089,7 @@ void z_xtensa_swap_update_page_tables(struct k_thread *incoming) struct arch_mem_domain *domain = &(incoming->mem_domain_info.mem_domain->arch); - /* Lets set the asid for the incoming thread */ - if ((incoming->base.user_options & K_USER) != 0) { - xtensa_rasid_asid_set(domain->asid, Z_XTENSA_USER_RING); - } - - switch_page_tables(ptables, true, false); + xtensa_set_paging(domain->asid, ptables); } #endif /* CONFIG_USERSPACE */ diff --git a/arch/xtensa/include/xtensa-asm2-s.h b/arch/xtensa/include/xtensa-asm2-s.h index 3f3ffd90b7ae043..416a83453a2d0e2 100644 --- a/arch/xtensa/include/xtensa-asm2-s.h +++ b/arch/xtensa/include/xtensa-asm2-s.h @@ -589,31 +589,6 @@ _Level\LVL\()VectorHelper : .global _Level\LVL\()Vector _Level\LVL\()Vector: #endif -#ifdef CONFIG_XTENSA_MMU - wsr.ZSR_MMU_0 a2 - wsr.ZSR_MMU_1 a3 - rsync - - /* Calculations below will clobber registers used. - * So we make a copy of the stack pointer to avoid - * changing it. - */ - mov a3, a1 - - CALC_PTEVADDR_BASE a2 - - /* Preload PTE entry page of current stack. */ - PRELOAD_PTEVADDR a3, a2 - - /* Preload PTE entry page of new stack, where - * it will be used later (in EXCINT_HANDLER above). 
- */ - rsr.ZSR_CPU a3 - PRELOAD_PTEVADDR a3, a2 - - rsr.ZSR_MMU_1 a3 - rsr.ZSR_MMU_0 a2 -#endif /* CONFIG_XTENSA_MMU */ addi a1, a1, -___xtensa_irq_bsa_t_SIZEOF s32i a0, a1, ___xtensa_irq_bsa_t_a0_OFFSET s32i a2, a1, ___xtensa_irq_bsa_t_a2_OFFSET diff --git a/soc/xtensa/dc233c/mmu.c b/soc/xtensa/dc233c/mmu.c index 718f79779ebd5ce..8decc952bf2ac66 100644 --- a/soc/xtensa/dc233c/mmu.c +++ b/soc/xtensa/dc233c/mmu.c @@ -35,33 +35,3 @@ const struct xtensa_mmu_range xtensa_soc_mmu_ranges[] = { }; int xtensa_soc_mmu_ranges_num = ARRAY_SIZE(xtensa_soc_mmu_ranges); - -void arch_xtensa_mmu_post_init(bool is_core0) -{ - uint32_t vecbase; - - ARG_UNUSED(is_core0); - - __asm__ volatile("rsr.vecbase %0" : "=r"(vecbase)); - - /* Invalidate any autorefill instr TLBs of VECBASE so we can map it - * permanently below. - */ - xtensa_itlb_vaddr_invalidate((void *)vecbase); - - /* Map VECBASE permanently in instr TLB way 4 so we will always have - * access to exception handlers. Each way 4 TLB covers 1MB (unless - * ITLBCFG has been changed before this, which should not have - * happened). Also this needs to be mapped as SHARED so both kernel - * and userspace can execute code here => same as .text. - * - * Note that we don't want to map the first 1MB in data TLB as - * we want to keep page 0 (0x00000000) unmapped to catch null pointer - * de-references. - */ - vecbase = ROUND_DOWN(vecbase, MB(1)); - xtensa_itlb_entry_write_sync( - Z_XTENSA_PTE(vecbase, Z_XTENSA_SHARED_RING, - Z_XTENSA_MMU_X | Z_XTENSA_MMU_CACHED_WT), - Z_XTENSA_TLB_ENTRY((uint32_t)vecbase, 4)); -}