Remove some asm work-arounds
Starting with GCC 7 and clang 15, single-bit std::atomic::fetch_or()
actually works, except for the most significant bit. That case was
fixed in GCC 13 but, as of now, not yet in clang:
llvm/llvm-project#37322

The Microsoft compiler still prefers to emit loops around LOCK CMPXCHG,
so we will keep using another work-around on that platform.

It should be noted that GCC 7 is the oldest compiler available in
currently supported GNU/Linux distributions.  FreeBSD 14 uses clang 16.
dr-m committed Sep 23, 2024
1 parent 8c16b1e commit 6ec92dd
Showing 4 changed files with 72 additions and 59 deletions.
29 changes: 29 additions & 0 deletions README.md
@@ -163,6 +163,35 @@ GNU/Linux, presumably because the built-in instrumentation for
`pthread_mutex_t` interferes with the additional instrumentation in
`atomic_mutex`.
### Target limitations around atomic operations
Some instruction set architectures (ISA) seriously limit the choice of
atomic operations that may be executed efficiently. On the commonly
used IA-32 or x86-64 ISA, compilers may fall back to the generic case
of generating a loop around a compare-and-swap (`lock cmpxchg`).
On x86, the most straightforward read-modify-write operations are
`std::atomic::fetch_add()` or `std::atomic::fetch_sub()`, both of
which translate into the 80486 `lock xadd` instruction.
Depending on the compiler, some single-bit versions of `fetch_or()`,
`fetch_and()`, or `fetch_xor()` may be translated into the 80386
instructions `lock bts`, `lock btr`, or `lock btc`. This mostly works
starting with GCC 7 or clang 15. There are additional limitations
regarding the most significant bit; these were lifted in GCC 13 but
not in clang (version 19 as of this writing). The Microsoft compiler
still requires its own intrinsics, such as
`_interlockedbittestandset()`, to be used.
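As a minimal sketch of the pattern in question (the helper name and the
choice of bit 0 are illustrative only, not part of this repository), a
single-bit `fetch_or()` can be written so that GCC 7+ and clang 15+ may
compile it into `lock bts`, while MSVC needs the intrinsic to avoid a
loop around `lock cmpxchg`:
```c++
#include <atomic>
#ifdef _MSC_VER
# include <intrin.h>
#endif

/* Atomically set bit 0 and return whether it was already set. */
inline bool test_and_set_bit0(std::atomic<unsigned> &m) noexcept
{
#ifdef _MSC_VER
  /* MSVC would otherwise emit a loop around LOCK CMPXCHG. */
  return _interlockedbittestandset(reinterpret_cast<volatile long *>(&m), 0);
#else
  /* GCC 7+ and clang 15+ can translate this into LOCK BTS. */
  return m.fetch_or(1U, std::memory_order_acquire) & 1U;
#endif
}
```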
On POWER and ARM, atomic operations traditionally translate into a
loop around load-locked followed by store-conditional (LL/SC). The
ARMv8.1-a revision includes Large System Extensions (LSE), which provide
not only the `ldadd` instruction for `fetch_add()` and `fetch_sub()` but
also corresponding instructions for `fetch_and()`, `fetch_xor()`, and
`fetch_or()`. If you need to support multiple ARM ISA revisions, you
may want to enable the compiler option `-moutline-atomics` so that the
best implementation can be chosen at run time.
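The effect of these options can be seen on a plain `fetch_add()`; the
function below is only an illustration, not taken from the changed files:
```c++
#include <atomic>
#include <cstdint>

/* On AArch64, building with -march=armv8.1-a lets the compiler emit the
   LSE LDADD instruction here; with -moutline-atomics on a baseline
   ARMv8.0 target, a small library helper picks LDADD or an LL/SC loop
   at run time. */
inline uint32_t fetch_and_increment(std::atomic<uint32_t> &counter) noexcept
{
  return counter.fetch_add(1, std::memory_order_relaxed);
}
```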
### Lock elision
The `transactional_lock_guard` is like `std::lock_guard` but designed
83 changes: 31 additions & 52 deletions atomic_sync/atomic_mutex.cc
@@ -35,39 +35,8 @@ inline void mutex_storage<uint32_t>::wait(uint32_t old) const noexcept
{FUTEX(WAIT, &m, old);}
#endif

/*
Unfortunately, compilers targeting IA-32 or AMD64 currently cannot
translate the following single-bit operations into Intel 80386 instructions:
m.fetch_or(1<<b) & 1<<b LOCK BTS b, m
m.fetch_and(~(1<<b)) & 1<<b LOCK BTR b, m
m.fetch_xor(1<<b) & 1<<b LOCK BTC b, m
In g++-12 and clang++-15 this actually works, except for b==31:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102566
https://github.com/llvm/llvm-project/issues/37322
Hence, we will manually translate fetch_or() using GCC-style inline
assembler code or a MSVC intrinsic function.
*/
#if defined __clang_major__ && __clang_major__ < 10
/* Only clang-10 introduced support for asm goto */
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
# define IF_FETCH_OR_GOTO(mem, bit, label) \
__asm__ goto("lock btsl $" #bit ", %0\n\t" \
"jc %l1" : : "m" (mem) : "cc", "memory" : label);
# define IF_NOT_FETCH_OR_GOTO(mem, bit, label) \
__asm__ goto("lock btsl $" #bit ", %0\n\t" \
"jnc %l1" : : "m" (mem) : "cc", "memory" : label);
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_IX64)
# define IF_FETCH_OR_GOTO(mem, bit, label) \
if (_interlockedbittestandset(reinterpret_cast<volatile long*>(&mem), bit)) \
goto label;
# define IF_NOT_FETCH_OR_GOTO(mem, bit, label) \
if (!_interlockedbittestandset(reinterpret_cast<volatile long*>(&mem), bit))\
goto label;
#ifdef _WIN32
# include <windows.h>
#endif

template<typename T>
@@ -83,17 +52,19 @@ void mutex_storage<T>::lock_wait() noexcept
#else
m.wait(lk);
#endif
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_IX64
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
reload:
#endif
lk = m.load(std::memory_order_relaxed);
}
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_IX64
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
else
{
# ifdef IF_FETCH_OR_GOTO
static_assert(HOLDER == (1U << 31), "compatibility");
IF_FETCH_OR_GOTO(*this, 31, reload);
# ifdef _MSC_VER
static_assert(HOLDER == (1U << 0), "compatibility");
if (_interlockedbittestandset
(reinterpret_cast<volatile long*>(this), 0))
goto reload;
# else
if (m.fetch_or(HOLDER, std::memory_order_relaxed) & HOLDER)
goto reload;
@@ -115,10 +86,6 @@ void mutex_storage<T>::lock_wait() noexcept
}
}

#ifdef _WIN32
# include <windows.h>
#endif

#ifdef __GNUC__
__attribute__((noinline))
#elif defined _MSC_VER
@@ -157,10 +124,16 @@ void mutex_storage<T>::spin_lock_wait(unsigned spin_rounds) noexcept
lk = m.load(std::memory_order_relaxed);
if (!(lk & HOLDER))
{
#ifdef IF_NOT_FETCH_OR_GOTO
static_assert(HOLDER == (1U << 31), "compatibility");
IF_NOT_FETCH_OR_GOTO(*this, 31, acquired);
lk|= HOLDER;
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
lk += HOLDER;
# ifdef _MSC_VER
static_assert(HOLDER == (1U << 0), "compatibility");
if (!_interlockedbittestandset
(reinterpret_cast<volatile long*>(this), 0))
# else
if (!(m.fetch_or(HOLDER, std::memory_order_relaxed) & HOLDER))
# endif
goto acquired;
#else
if (!((lk = m.fetch_or(HOLDER, std::memory_order_relaxed)) & HOLDER))
goto acquired;
@@ -180,16 +153,22 @@ void mutex_storage<T>::spin_lock_wait(unsigned spin_rounds) noexcept
#else
m.wait(lk);
#endif
#ifdef IF_FETCH_OR_GOTO
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
reload:
#endif
lk = m.load(std::memory_order_relaxed);
}
else
{
#ifdef IF_FETCH_OR_GOTO
static_assert(HOLDER == (1U << 31), "compatibility");
IF_FETCH_OR_GOTO(*this, 31, reload);
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
# ifdef _MSC_VER
static_assert(HOLDER == (1U << 0), "compatibility");
if (_interlockedbittestandset
(reinterpret_cast<volatile long*>(this), 0))
# else
if (m.fetch_or(HOLDER, std::memory_order_relaxed) & HOLDER)
# endif
goto reload;
#else
if ((lk = m.fetch_or(HOLDER, std::memory_order_relaxed)) & HOLDER)
continue;
@@ -209,12 +188,12 @@ template void mutex_storage<uint32_t>::spin_lock_wait(unsigned) noexcept;
template<typename T>
void shared_mutex_storage<T>::lock_inner_wait(T lk) noexcept
{
assert(lk < X);
assert(!(lk & X));
lk |= X;

do
{
assert(lk > X);
assert(lk & X);
#if !defined _WIN32 && __cplusplus < 202002L /* Emulate the C++20 primitives */
FUTEX(WAIT, &inner, lk);
#else
4 changes: 2 additions & 2 deletions atomic_sync/atomic_mutex.h
@@ -12,8 +12,8 @@ class mutex_storage
// exposition only; see test_native_mutex for a possible alternative
std::atomic<type> m;

static constexpr type HOLDER = type(~(type(~type(0)) >> 1));
static constexpr type WAITER = 1;
static constexpr type HOLDER = 1;
static constexpr type WAITER = 2;

public:
constexpr bool is_locked() const noexcept
15 changes: 10 additions & 5 deletions atomic_sync/atomic_shared_mutex.h
@@ -62,11 +62,16 @@ class shared_mutex_storage
@retval 0 if the exclusive lock was granted */
type lock_inner() noexcept
{
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_IX64
/* On IA-32 and AMD64, this type of fetch_or() can only be implemented
as a loop around LOCK CMPXCHG. In this particular case, toggling the
most significant bit using fetch_add() is equivalent, and is
translated into a simple LOCK XADD. */
#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
static_assert(X == 1U << 31, "compatibility");
/* On IA-32 and AMD64, a fetch_XXX() that needs to return the
previous value of the word state can only be implemented
efficiently for fetch_add() or fetch_sub(), both of which
translate into a 80486 LOCK XADD instruction. Anything else would
translate into a loop around LOCK CMPXCHG. In this particular
case, we know that the bit was previously clear, and therefore
setting (actually toggling) the most significant bit using
fetch_add() or fetch_sub() is equivalent. */
return inner.fetch_add(X, std::memory_order_acquire);
#endif
return inner.fetch_or(X, std::memory_order_acquire);
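The equivalence that the new comment in `lock_inner()` relies on can be
checked with a small stand-alone sketch (not part of the commit): when the
`X` bit of the old value is known to be clear, adding `X` and OR-ing `X`
produce the same word, so `fetch_add()` and `fetch_or()` would also return
the same previous value.
```c++
#include <cassert>
#include <cstdint>

int main()
{
  constexpr uint32_t X = 1U << 31;
  uint32_t old_value = 5;          // any value with the X bit clear
  assert(!(old_value & X));
  // With the X bit clear, the addition cannot generate a carry into or
  // out of bit 31, so it sets exactly that bit, just like the OR.
  assert((old_value + X) == (old_value | X));
  return 0;
}
```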
