alexheretic · alexheretic · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,6 @@
 # Unreleased (v1.2.0)
+* Windows: Use a high resolution waitable timer when available (>= Windows 10, version 1803).
+* Windows: Replace _winapi_ with _windows-sys_ dependency.
 * Windows: Remove _once_cell_ dependency.
 
 # v1.1.1

diff --git a/Cargo.toml b/Cargo.toml
@@ -10,7 +10,9 @@ license = "Apache-2.0"
 readme="README.md"
 
 [target.'cfg(windows)'.dependencies]
-winapi = { version = "0.3", features = ["minwindef", "mmsystem", "timeapi"] }
+windows-sys = { version = "0.52", features = [
+    "Win32_Foundation", "Win32_Security", "Win32_System", "Win32_System_Threading", "Win32_Media"
+] }
 
 [dev-dependencies]
 approx = "0.5"

diff --git a/README.md b/README.md
@@ -47,8 +47,10 @@ let sleeper = SpinSleeper::default();
 ```
 
 ### Windows Accuracy
-Windows has particularly poor accuracy by default (~15ms), `spin_sleep` will automatically
-select the best accuracy on Windows generally achieving ~1ms native sleep accuracy *(Since 0.3.3)*.
+Windows (>= Windows 10, version 1803) will use a high-resolution waitable timer, similar to `std::thread::sleep` in Rust >= 1.75.
+
+Earlier versions of Windows have particularly poor accuracy by default (~15ms); `spin_sleep` will automatically
+select the best accuracy on Windows, generally achieving ~1-2ms native sleep accuracy.
 
 ## Minimum supported rust compiler
 This crate is maintained with [latest stable rust](https://gist.github.com/alexheretic/d1e98d8433b602e57f5d0a9637927e0c).
diff --git a/experiments/README.md b/experiments/README.md
@@ -3,20 +3,60 @@ Experiments to measure latency all machine specific & non-deterministic but are
 good default settings for _spin_sleep_.
 
 ## native_sleep_accuracy
-Call OS native sleep for **1ns** and see how long it actually takes.
+Call OS native sleep for **1ns, 1µs & 1ms** and see how long it actually takes.
 
 ```sh
 cargo run --bin native_sleep_accuracy --release
 ```
 
 **Linux example output** *
 ```
-average: 53.04µs, best : 7.95µs, worst: 85.238µs
+==> sleep 1ns
+average: 54.0µs, best: 8.7µs, worst: 94.1µs
+==> sleep 1µs
+average: 55.1µs, best: 8.3µs, worst: 60.4µs
+==> sleep 1ms
+average: 1.055ms, best: 1.054ms, worst: 1.058ms
 ```
 
 **Windows example output** *
 ```
-average: 2.012432ms, best : 2.0069ms, worst: 2.1455ms
+==> sleep 1ns
+average: 2.0µs, best: 1.3µs, worst: 13.9µs
+==> sleep 1µs
+average: 446.7µs, best: 2.3µs, worst: 725.8µs
+==> sleep 1ms
+average: 1.775ms, best: 1.502ms, worst: 2.012ms
+```
+
+### Under high load
+Do the same measurement as above but while all cores are being stressed.
+```sh
+cargo run --bin native_sleep_accuracy --release -- load
+```
+
+**Linux example output** *
+Generally similar to no load, but more likely to produce occasional high latency.
+```
+Simulating 16 thread load
+==> sleep 1ns
+average: 53.8µs, best: 7.1µs, worst: 231.6µs
+==> sleep 1µs
+average: 58.0µs, best: 7.6µs, worst: 3.3ms
+==> sleep 1ms
+average: 1.054ms, best: 1.054ms, worst: 1.055ms
+```
+
+**Windows example output** *
+High latency is fairly common.
+```
+Simulating 16 thread load
+==> sleep 1ns
+average: 39.3µs, best: 1.8µs, worst: 36.8ms
+==> sleep 1µs
+average: 14.9ms, best: 2.1µs, worst: 46.9ms
+==> sleep 1ms
+average: 16.025ms, best: 2.004ms, worst: 30.071ms
 ```
 
 ## spin_strategy_latency
@@ -30,38 +70,38 @@ cargo run --bin spin_strategy_latency --release
 **Linux example output** *
 ```
 warming up...
-5ms    None          avg-spins: 191610   avg-actual: 5.000044ms
-5ms    SpinLoopHint  avg-spins: 176594   avg-actual: 5.000045ms
-5ms    YieldThread   avg-spins: 38366    avg-actual: 5.000105ms
-900µs  None          avg-spins: 34340    avg-actual: 900.05µs
-900µs  SpinLoopHint  avg-spins: 31633    avg-actual: 900.052µs
-900µs  YieldThread   avg-spins: 6843     avg-actual: 900.104µs
-5µs    None          avg-spins: 186      avg-actual: 5.04µs
-5µs    SpinLoopHint  avg-spins: 173      avg-actual: 5.048µs
-5µs    YieldThread   avg-spins: 38       avg-actual: 5.075µs
-100ns  None          avg-spins: 3        avg-actual: 135ns
+5ms    None          avg-spins: 231633   avg-actual: 5.000039ms
+5ms    SpinLoopHint  avg-spins: 168571   avg-actual: 5.000041ms
+5ms    YieldThread   avg-spins: 8431     avg-actual: 5.000323ms
+900µs  None          avg-spins: 41194    avg-actual: 900.039µs
+900µs  SpinLoopHint  avg-spins: 30094    avg-actual: 900.044µs
+900µs  YieldThread   avg-spins: 1527     avg-actual: 900.349µs
+5µs    None          avg-spins: 231      avg-actual: 5.033µs
+5µs    SpinLoopHint  avg-spins: 167      avg-actual: 5.063µs
+5µs    YieldThread   avg-spins: 9        avg-actual: 5.229µs
+100ns  None          avg-spins: 4        avg-actual: 129ns
 100ns  SpinLoopHint  avg-spins: 3        avg-actual: 132ns
-100ns  YieldThread   avg-spins: 1        avg-actual: 181ns
+100ns  YieldThread   avg-spins: 1        avg-actual: 625ns
 ```
 
 **Windows example output** *
 ```
 warming up...
-5ms    None          avg-spins: 158591   avg-actual: 5ms
-5ms    SpinLoopHint  avg-spins: 134568   avg-actual: 5ms
-5ms    YieldThread   avg-spins: 50380    avg-actual: 5.000039ms
-900µs  None          avg-spins: 28491    avg-actual: 900µs
-900µs  SpinLoopHint  avg-spins: 24128    avg-actual: 900.002µs
-900µs  YieldThread   avg-spins: 9070     avg-actual: 900.033µs
-5µs    None          avg-spins: 155      avg-actual: 5µs
-5µs    SpinLoopHint  avg-spins: 133      avg-actual: 5µs
-5µs    YieldThread   avg-spins: 49       avg-actual: 5.042µs
+5ms    None          avg-spins: 176820   avg-actual: 5ms
+5ms    SpinLoopHint  avg-spins: 164060   avg-actual: 5ms
+5ms    YieldThread   avg-spins: 31789    avg-actual: 5.000064ms
+900µs  None          avg-spins: 31791    avg-actual: 900µs
+900µs  SpinLoopHint  avg-spins: 29406    avg-actual: 900.021µs
+900µs  YieldThread   avg-spins: 5700     avg-actual: 900.063µs
+5µs    None          avg-spins: 139      avg-actual: 5µs
+5µs    SpinLoopHint  avg-spins: 160      avg-actual: 5µs
+5µs    YieldThread   avg-spins: 31       avg-actual: 5.09µs
 100ns  None          avg-spins: 0        avg-actual: 100ns
 100ns  SpinLoopHint  avg-spins: 0        avg-actual: 100ns
-100ns  YieldThread   avg-spins: 1        avg-actual: 102ns
+100ns  YieldThread   avg-spins: 0        avg-actual: 172ns
 ```
 
-## spin_strategy_latency under load
+### Under high load
 Do the same measurement as above but while all cores are being stressed.
 
 ```sh
@@ -72,36 +112,36 @@ cargo run --bin spin_strategy_latency --release -- load
 ```
 Simulating 16 thread load
 warming up...
-5ms    None          avg-spins: 159018   avg-actual: 5.000058ms
-5ms    SpinLoopHint  avg-spins: 122263   avg-actual: 5.000065ms
-5ms    YieldThread   avg-spins: 23265    avg-actual: 5.000327ms
-900µs  None          avg-spins: 27748    avg-actual: 938.427µs
-900µs  SpinLoopHint  avg-spins: 21727    avg-actual: 900.062µs
-900µs  YieldThread   avg-spins: 4054     avg-actual: 901.31µs
-5µs    None          avg-spins: 157      avg-actual: 5.055µs
-5µs    SpinLoopHint  avg-spins: 122      avg-actual: 5.057µs
-5µs    YieldThread   avg-spins: 23       avg-actual: 5.07µs
-100ns  None          avg-spins: 2        avg-actual: 147ns
-100ns  SpinLoopHint  avg-spins: 1        avg-actual: 135ns
-100ns  YieldThread   avg-spins: 1        avg-actual: 278ns
+5ms    None          avg-spins: 170998   avg-actual: 5.374337ms
+5ms    SpinLoopHint  avg-spins: 110830   avg-actual: 5.385263ms
+5ms    YieldThread   avg-spins: 6457     avg-actual: 5.000448ms
+900µs  None          avg-spins: 34035    avg-actual: 900.045µs
+900µs  SpinLoopHint  avg-spins: 21661    avg-actual: 900.051µs
+900µs  YieldThread   avg-spins: 1132     avg-actual: 900.54µs
+5µs    None          avg-spins: 186      avg-actual: 5.18µs
+5µs    SpinLoopHint  avg-spins: 117      avg-actual: 5.124µs
+5µs    YieldThread   avg-spins: 6        avg-actual: 5.621µs
+100ns  None          avg-spins: 3        avg-actual: 128ns
+100ns  SpinLoopHint  avg-spins: 2        avg-actual: 131ns
+100ns  YieldThread   avg-spins: 1        avg-actual: 898ns
 ```
 
 **Windows example output** *
 ```
 Simulating 16 thread load
 warming up...
-5ms    None          avg-spins: 105568   avg-actual: 5.838449ms
-5ms    SpinLoopHint  avg-spins: 79548    avg-actual: 5.608363ms
-5ms    YieldThread   avg-spins: 1        avg-actual: 17.526351ms
-900µs  None          avg-spins: 19461    avg-actual: 1.127537ms
-900µs  SpinLoopHint  avg-spins: 14578    avg-actual: 1.326708ms
-900µs  YieldThread   avg-spins: 1        avg-actual: 17.526448ms
-5µs    None          avg-spins: 108      avg-actual: 5µs
-5µs    SpinLoopHint  avg-spins: 79       avg-actual: 6.298µs
-5µs    YieldThread   avg-spins: 1        avg-actual: 11.417271ms
-100ns  None          avg-spins: 1        avg-actual: 101ns
-100ns  SpinLoopHint  avg-spins: 0        avg-actual: 102ns
-100ns  YieldThread   avg-spins: 0        avg-actual: 7.716038ms
-```
-
-\* _Measured 2022-02-18 with a AMD 5800X_.
+5ms    None          avg-spins: 140709   avg-actual: 5.604986ms
+5ms    SpinLoopHint  avg-spins: 108241   avg-actual: 5.81583ms
+5ms    YieldThread   avg-spins: 3        avg-actual: 32.039572ms
+900µs  None          avg-spins: 27701    avg-actual: 902.595µs
+900µs  SpinLoopHint  avg-spins: 20202    avg-actual: 1.210891ms
+900µs  YieldThread   avg-spins: 1        avg-actual: 11.297962ms
+5µs    None          avg-spins: 153      avg-actual: 5µs
+5µs    SpinLoopHint  avg-spins: 110      avg-actual: 5µs
+5µs    YieldThread   avg-spins: 1        avg-actual: 13.948654ms
+100ns  None          avg-spins: 0        avg-actual: 100ns
+100ns  SpinLoopHint  avg-spins: 0        avg-actual: 100ns
+100ns  YieldThread   avg-spins: 0        avg-actual: 2.882577ms
+```
+
+\* _Measured 2024-01-02 with an AMD 5800X_.
diff --git a/experiments/src/bin/native_sleep_accuracy.rs b/experiments/src/bin/native_sleep_accuracy.rs
@@ -1,4 +1,4 @@
-//! Call OS native sleep for **1ns** and see how long it actually takes.
+//! Call OS native sleep for **1ns, 1µs & 1ms** and see how long it actually takes.
 use std::time::{Duration, Instant};
 
 fn main() {
@@ -7,15 +7,31 @@ fn main() {
         std::process::exit(1);
     }
 
+    if std::env::args().nth(1).as_deref() == Some("load") {
+        let cpus = std::thread::available_parallelism().unwrap().into();
+        eprintln!("Simulating {cpus} thread load");
+        for _ in 0..cpus {
+            std::thread::spawn(|| {
+                use rand::Rng;
+                let mut rng = rand::thread_rng();
+                while rng.gen::<u64>() > 0 {}
+            });
+        }
+
+        std::thread::sleep(Duration::from_secs(1));
+    }
+
+    eprintln!("==> sleep 1ns");
+
     const ITS: u32 = 1000;
 
-    let mut best = Duration::from_secs(100);
-    let mut sum = Duration::from_secs(0);
-    let mut worst = Duration::from_secs(0);
+    let mut best = Duration::MAX;
+    let mut sum = Duration::ZERO;
+    let mut worst = Duration::ZERO;
 
     for _ in 0..ITS {
         let before = Instant::now();
-        spin_sleep::native_sleep(Duration::new(0, 1));
+        spin_sleep::native_sleep(Duration::from_nanos(1));
         let elapsed = before.elapsed();
         sum += elapsed;
         if elapsed < best {
@@ -27,7 +43,55 @@ fn main() {
     }
 
     println!(
-        "average: {:?}, best : {best:?}, worst: {worst:?}",
+        "average: {:.1?}, best: {best:.1?}, worst: {worst:.1?}",
         sum / ITS
     );
+
+    eprintln!("==> sleep 1µs");
+
+    let mut best = Duration::MAX;
+    let mut sum = Duration::ZERO;
+    let mut worst = Duration::ZERO;
+
+    for _ in 0..ITS {
+        let before = Instant::now();
+        spin_sleep::native_sleep(Duration::from_micros(1));
+        let elapsed = before.elapsed();
+        sum += elapsed;
+        if elapsed < best {
+            best = elapsed;
+        }
+        if elapsed > worst {
+            worst = elapsed;
+        }
+    }
+
+    println!(
+        "average: {:.1?}, best: {best:.1?}, worst: {worst:.1?}",
+        sum / ITS
+    );
+
+    eprintln!("==> sleep 1ms");
+
+    let mut best = Duration::MAX;
+    let mut sum = Duration::ZERO;
+    let mut worst = Duration::ZERO;
+
+    for _ in 0..50 {
+        let before = Instant::now();
+        spin_sleep::native_sleep(Duration::from_millis(1));
+        let elapsed = before.elapsed();
+        sum += elapsed;
+        if elapsed < best {
+            best = elapsed;
+        }
+        if elapsed > worst {
+            worst = elapsed;
+        }
+    }
+
+    println!(
+        "average: {:.3?}, best: {best:.3?}, worst: {worst:.3?}",
+        sum / 50
+    );
 }
diff --git a/src/lib.rs b/src/lib.rs
@@ -81,8 +81,9 @@ const DEFAULT_NATIVE_SLEEP_ACCURACY: SubsecondNanoseconds = 125_000;
 /// **Does not spin.**
 ///
 /// Equivalent to [`std::thread::sleep`], with the following exceptions:
-/// * **Windows**: Automatically selects the best native sleep accuracy generally achieving ~1ms
-/// native sleep accuracy, instead of default ~16ms.
+/// * **Windows** (>= Windows 10, version 1803): Uses a high resolution waitable timer, similar to std in rust >= 1.75.
+/// * **Windows** (< Windows 10, version 1803): Automatically selects the best native sleep accuracy
+///   generally achieving ~1ms native sleep accuracy, instead of default ~16ms.
 #[inline]
 pub fn native_sleep(duration: Duration) {
     #[cfg(windows)]
@@ -97,7 +98,7 @@ impl Default for SpinSleeper {
     #[inline]
     fn default() -> Self {
         #[cfg(windows)]
-        let accuracy = windows::min_time_period() * 1_000_000;
+        let accuracy = windows::sleep_accuracy();
         #[cfg(not(windows))]
         let accuracy = DEFAULT_NATIVE_SLEEP_ACCURACY;