diff --git a/CHANGELOG.md b/CHANGELOG.md index 0478a06..44cd462 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,6 @@ # Unreleased (v1.2.0) +* Windows: Use a high resolution waitable timer when available (>= Windows 10, version 1803). +* Windows: Replace _winapi_ with _windows-sys_ dependency. * Windows: Remove _once_cell_ dependency. # v1.1.1 diff --git a/Cargo.toml b/Cargo.toml index f0bea2d..28b767d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,9 @@ license = "Apache-2.0" readme="README.md" [target.'cfg(windows)'.dependencies] -winapi = { version = "0.3", features = ["minwindef", "mmsystem", "timeapi"] } +windows-sys = { version = "0.52", features = [ + "Win32_Foundation", "Win32_Security", "Win32_System", "Win32_System_Threading", "Win32_Media" +] } [dev-dependencies] approx = "0.5" diff --git a/README.md b/README.md index 67a8114..df0c019 100644 --- a/README.md +++ b/README.md @@ -47,8 +47,10 @@ let sleeper = SpinSleeper::default(); ``` ### Windows Accuracy -Windows has particularly poor accuracy by default (~15ms), `spin_sleep` will automatically -select the best accuracy on Windows generally achieving ~1ms native sleep accuracy *(Since 0.3.3)*. +Windows (>= Windows 10, version 1803) will use a high resolution waitable timer, similar to sleep in rust std >= 1.75. + +Earlier versions of Windows have particularly poor accuracy by default (~15ms), `spin_sleep` will automatically +select the best accuracy on Windows generally achieving ~1-2ms native sleep accuracy. ## Minimum supported rust compiler This crate is maintained with [latest stable rust](https://gist.github.com/alexheretic/d1e98d8433b602e57f5d0a9637927e0c). diff --git a/experiments/README.md b/experiments/README.md index e4c5338..94b978c 100644 --- a/experiments/README.md +++ b/experiments/README.md @@ -3,7 +3,7 @@ Experiments to measure latency all machine specific & non-deterministic but are good default settings for _spin_sleep_. 
## native_sleep_accuracy -Call OS native sleep for **1ns** and see how long it actually takes. +Call OS native sleep for **1ns, 1µs & 1ms** and see how long it actually takes. ```sh cargo run --bin native_sleep_accuracy --release @@ -11,12 +11,52 @@ cargo run --bin native_sleep_accuracy --release **Linux example output** * ``` -average: 53.04µs, best : 7.95µs, worst: 85.238µs +==> sleep 1ns +average: 54.0µs, best: 8.7µs, worst: 94.1µs +==> sleep 1µs +average: 55.1µs, best: 8.3µs, worst: 60.4µs +==> sleep 1ms +average: 1.055ms, best: 1.054ms, worst: 1.058ms ``` **Windows example output** * ``` -average: 2.012432ms, best : 2.0069ms, worst: 2.1455ms +==> sleep 1ns +average: 2.0µs, best: 1.3µs, worst: 13.9µs +==> sleep 1µs +average: 446.7µs, best: 2.3µs, worst: 725.8µs +==> sleep 1ms +average: 1.775ms, best: 1.502ms, worst: 2.012ms +``` + +### Under high load +Do the same measurement as above but while all cores are being stressed. +```sh +cargo run --bin native_sleep_accuracy --release -- load +``` + +**Linux example output** * +Generally similar to no load, but more likely to produce occasional high latency. +``` +Simulating 16 thread load +==> sleep 1ns +average: 53.8µs, best: 7.1µs, worst: 231.6µs +==> sleep 1µs +average: 58.0µs, best: 7.6µs, worst: 3.3ms +==> sleep 1ms +average: 1.054ms, best: 1.054ms, worst: 1.055ms +``` + +**Windows example output** * +High latency is fairly common. +``` +Simulating 16 thread load +==> sleep 1ns +average: 39.3µs, best: 1.8µs, worst: 36.8ms +==> sleep 1µs +average: 14.9ms, best: 2.1µs, worst: 46.9ms +==> sleep 1ms +average: 16.025ms, best: 2.004ms, worst: 30.071ms ``` ## spin_strategy_latency @@ -30,38 +70,38 @@ cargo run --bin spin_strategy_latency --release **Linux example output** * ``` warming up... 
-5ms None avg-spins: 191610 avg-actual: 5.000044ms -5ms SpinLoopHint avg-spins: 176594 avg-actual: 5.000045ms -5ms YieldThread avg-spins: 38366 avg-actual: 5.000105ms -900µs None avg-spins: 34340 avg-actual: 900.05µs -900µs SpinLoopHint avg-spins: 31633 avg-actual: 900.052µs -900µs YieldThread avg-spins: 6843 avg-actual: 900.104µs -5µs None avg-spins: 186 avg-actual: 5.04µs -5µs SpinLoopHint avg-spins: 173 avg-actual: 5.048µs -5µs YieldThread avg-spins: 38 avg-actual: 5.075µs -100ns None avg-spins: 3 avg-actual: 135ns +5ms None avg-spins: 231633 avg-actual: 5.000039ms +5ms SpinLoopHint avg-spins: 168571 avg-actual: 5.000041ms +5ms YieldThread avg-spins: 8431 avg-actual: 5.000323ms +900µs None avg-spins: 41194 avg-actual: 900.039µs +900µs SpinLoopHint avg-spins: 30094 avg-actual: 900.044µs +900µs YieldThread avg-spins: 1527 avg-actual: 900.349µs +5µs None avg-spins: 231 avg-actual: 5.033µs +5µs SpinLoopHint avg-spins: 167 avg-actual: 5.063µs +5µs YieldThread avg-spins: 9 avg-actual: 5.229µs +100ns None avg-spins: 4 avg-actual: 129ns 100ns SpinLoopHint avg-spins: 3 avg-actual: 132ns -100ns YieldThread avg-spins: 1 avg-actual: 181ns +100ns YieldThread avg-spins: 1 avg-actual: 625ns ``` **Windows example output** * ``` warming up... 
-5ms None avg-spins: 158591 avg-actual: 5ms -5ms SpinLoopHint avg-spins: 134568 avg-actual: 5ms -5ms YieldThread avg-spins: 50380 avg-actual: 5.000039ms -900µs None avg-spins: 28491 avg-actual: 900µs -900µs SpinLoopHint avg-spins: 24128 avg-actual: 900.002µs -900µs YieldThread avg-spins: 9070 avg-actual: 900.033µs -5µs None avg-spins: 155 avg-actual: 5µs -5µs SpinLoopHint avg-spins: 133 avg-actual: 5µs -5µs YieldThread avg-spins: 49 avg-actual: 5.042µs +5ms None avg-spins: 176820 avg-actual: 5ms +5ms SpinLoopHint avg-spins: 164060 avg-actual: 5ms +5ms YieldThread avg-spins: 31789 avg-actual: 5.000064ms +900µs None avg-spins: 31791 avg-actual: 900µs +900µs SpinLoopHint avg-spins: 29406 avg-actual: 900.021µs +900µs YieldThread avg-spins: 5700 avg-actual: 900.063µs +5µs None avg-spins: 139 avg-actual: 5µs +5µs SpinLoopHint avg-spins: 160 avg-actual: 5µs +5µs YieldThread avg-spins: 31 avg-actual: 5.09µs 100ns None avg-spins: 0 avg-actual: 100ns 100ns SpinLoopHint avg-spins: 0 avg-actual: 100ns -100ns YieldThread avg-spins: 1 avg-actual: 102ns +100ns YieldThread avg-spins: 0 avg-actual: 172ns ``` -## spin_strategy_latency under load +### Under high load Do the same measurement as above but while all cores are being stressed. ```sh @@ -72,36 +112,36 @@ cargo run --bin spin_strategy_latency --release -- load ``` Simulating 16 thread load warming up... 
-5ms None avg-spins: 159018 avg-actual: 5.000058ms -5ms SpinLoopHint avg-spins: 122263 avg-actual: 5.000065ms -5ms YieldThread avg-spins: 23265 avg-actual: 5.000327ms -900µs None avg-spins: 27748 avg-actual: 938.427µs -900µs SpinLoopHint avg-spins: 21727 avg-actual: 900.062µs -900µs YieldThread avg-spins: 4054 avg-actual: 901.31µs -5µs None avg-spins: 157 avg-actual: 5.055µs -5µs SpinLoopHint avg-spins: 122 avg-actual: 5.057µs -5µs YieldThread avg-spins: 23 avg-actual: 5.07µs -100ns None avg-spins: 2 avg-actual: 147ns -100ns SpinLoopHint avg-spins: 1 avg-actual: 135ns -100ns YieldThread avg-spins: 1 avg-actual: 278ns +5ms None avg-spins: 170998 avg-actual: 5.374337ms +5ms SpinLoopHint avg-spins: 110830 avg-actual: 5.385263ms +5ms YieldThread avg-spins: 6457 avg-actual: 5.000448ms +900µs None avg-spins: 34035 avg-actual: 900.045µs +900µs SpinLoopHint avg-spins: 21661 avg-actual: 900.051µs +900µs YieldThread avg-spins: 1132 avg-actual: 900.54µs +5µs None avg-spins: 186 avg-actual: 5.18µs +5µs SpinLoopHint avg-spins: 117 avg-actual: 5.124µs +5µs YieldThread avg-spins: 6 avg-actual: 5.621µs +100ns None avg-spins: 3 avg-actual: 128ns +100ns SpinLoopHint avg-spins: 2 avg-actual: 131ns +100ns YieldThread avg-spins: 1 avg-actual: 898ns ``` **Windows example output** * ``` Simulating 16 thread load warming up... 
-5ms None avg-spins: 105568 avg-actual: 5.838449ms -5ms SpinLoopHint avg-spins: 79548 avg-actual: 5.608363ms -5ms YieldThread avg-spins: 1 avg-actual: 17.526351ms -900µs None avg-spins: 19461 avg-actual: 1.127537ms -900µs SpinLoopHint avg-spins: 14578 avg-actual: 1.326708ms -900µs YieldThread avg-spins: 1 avg-actual: 17.526448ms -5µs None avg-spins: 108 avg-actual: 5µs -5µs SpinLoopHint avg-spins: 79 avg-actual: 6.298µs -5µs YieldThread avg-spins: 1 avg-actual: 11.417271ms -100ns None avg-spins: 1 avg-actual: 101ns -100ns SpinLoopHint avg-spins: 0 avg-actual: 102ns -100ns YieldThread avg-spins: 0 avg-actual: 7.716038ms -``` - -\* _Measured 2022-02-18 with a AMD 5800X_. +5ms None avg-spins: 140709 avg-actual: 5.604986ms +5ms SpinLoopHint avg-spins: 108241 avg-actual: 5.81583ms +5ms YieldThread avg-spins: 3 avg-actual: 32.039572ms +900µs None avg-spins: 27701 avg-actual: 902.595µs +900µs SpinLoopHint avg-spins: 20202 avg-actual: 1.210891ms +900µs YieldThread avg-spins: 1 avg-actual: 11.297962ms +5µs None avg-spins: 153 avg-actual: 5µs +5µs SpinLoopHint avg-spins: 110 avg-actual: 5µs +5µs YieldThread avg-spins: 1 avg-actual: 13.948654ms +100ns None avg-spins: 0 avg-actual: 100ns +100ns SpinLoopHint avg-spins: 0 avg-actual: 100ns +100ns YieldThread avg-spins: 0 avg-actual: 2.882577ms +``` + +\* _Measured 2024-01-02 with an AMD 5800X_. diff --git a/experiments/src/bin/native_sleep_accuracy.rs b/experiments/src/bin/native_sleep_accuracy.rs index 7d9449e..cf51d25 100644 --- a/experiments/src/bin/native_sleep_accuracy.rs +++ b/experiments/src/bin/native_sleep_accuracy.rs @@ -1,4 +1,4 @@ -//! Call OS native sleep for **1ns** and see how long it actually takes. +//! Call OS native sleep for **1ns, 1µs & 1ms** and see how long it actually takes. 
use std::time::{Duration, Instant}; fn main() { @@ -7,15 +7,31 @@ fn main() { std::process::exit(1); } + if std::env::args().nth(1).as_deref() == Some("load") { + let cpus = std::thread::available_parallelism().unwrap().into(); + eprintln!("Simulating {cpus} thread load"); + for _ in 0..cpus { + std::thread::spawn(|| { + use rand::Rng; + let mut rng = rand::thread_rng(); + while rng.gen::<u64>() > 0 {} + }); + } + + std::thread::sleep(Duration::from_secs(1)); + } + + eprintln!("==> sleep 1ns"); + const ITS: u32 = 1000; - let mut best = Duration::from_secs(100); - let mut sum = Duration::from_secs(0); - let mut worst = Duration::from_secs(0); + let mut best = Duration::MAX; + let mut sum = Duration::ZERO; + let mut worst = Duration::ZERO; for _ in 0..ITS { let before = Instant::now(); - spin_sleep::native_sleep(Duration::new(0, 1)); + spin_sleep::native_sleep(Duration::from_nanos(1)); let elapsed = before.elapsed(); sum += elapsed; if elapsed < best { @@ -27,7 +43,55 @@ fn main() { } println!( - "average: {:?}, best : {best:?}, worst: {worst:?}", + "average: {:.1?}, best: {best:.1?}, worst: {worst:.1?}", sum / ITS ); + + eprintln!("==> sleep 1µs"); + + let mut best = Duration::MAX; + let mut sum = Duration::ZERO; + let mut worst = Duration::ZERO; + + for _ in 0..ITS { + let before = Instant::now(); + spin_sleep::native_sleep(Duration::from_micros(1)); + let elapsed = before.elapsed(); + sum += elapsed; + if elapsed < best { + best = elapsed; + } + if elapsed > worst { + worst = elapsed; + } + } + + println!( + "average: {:.1?}, best: {best:.1?}, worst: {worst:.1?}", + sum / ITS + ); + + eprintln!("==> sleep 1ms"); + + let mut best = Duration::MAX; + let mut sum = Duration::ZERO; + let mut worst = Duration::ZERO; + + for _ in 0..50 { + let before = Instant::now(); + spin_sleep::native_sleep(Duration::from_millis(1)); + let elapsed = before.elapsed(); + sum += elapsed; + if elapsed < best { + best = elapsed; + } + if elapsed > worst { + worst = elapsed; + } + } + 
println!( + "average: {:.3?}, best: {best:.3?}, worst: {worst:.3?}", + sum / 50 + ); } diff --git a/src/lib.rs b/src/lib.rs index bbd74d8..9439d2f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -81,8 +81,9 @@ const DEFAULT_NATIVE_SLEEP_ACCURACY: SubsecondNanoseconds = 125_000; /// **Does not spin.** /// /// Equivalent to [`std::thread::sleep`], with the following exceptions: -/// * **Windows**: Automatically selects the best native sleep accuracy generally achieving ~1ms -/// native sleep accuracy, instead of default ~16ms. +/// * **Windows** (>= Windows 10, version 1803): Uses a high resolution waitable timer, similar to std in rust >= 1.75. +/// * **Windows** (< Windows 10, version 1803): Automatically selects the best native sleep accuracy +/// generally achieving ~1ms native sleep accuracy, instead of default ~16ms. #[inline] pub fn native_sleep(duration: Duration) { #[cfg(windows)] @@ -97,7 +98,7 @@ impl Default for SpinSleeper { #[inline] fn default() -> Self { #[cfg(windows)] - let accuracy = windows::min_time_period() * 1_000_000; + let accuracy = windows::sleep_accuracy(); #[cfg(not(windows))] let accuracy = DEFAULT_NATIVE_SLEEP_ACCURACY; diff --git a/src/windows.rs b/src/windows.rs index be3a60e..cc61078 100644 --- a/src/windows.rs +++ b/src/windows.rs @@ -1,24 +1,42 @@ -use std::{mem, sync::OnceLock, time::Duration}; -use winapi::{ - shared::minwindef::UINT, - um::{ - mmsystem::{TIMECAPS, TIMERR_NOERROR}, - timeapi::{timeBeginPeriod, timeEndPeriod, timeGetDevCaps}, +use std::{mem, ops::Neg, ptr::null, sync::OnceLock, time::Duration}; +use windows_sys::Win32::{ + Foundation::{CloseHandle, FALSE}, + Media::{timeBeginPeriod, timeEndPeriod, timeGetDevCaps, TIMECAPS, TIMERR_NOERROR}, + System::Threading::{ + CreateWaitableTimerExW, SetWaitableTimer, WaitForSingleObject, + CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, INFINITE, TIMER_ALL_ACCESS, }, }; #[inline] pub fn native_sleep(duration: Duration) { - let min_time_period = min_time_period(); - unsafe { - 
timeBeginPeriod(min_time_period); - std::thread::sleep(duration); - timeEndPeriod(min_time_period); + if high_res_sleep(&duration).is_err() { + // fallback for OS earlier than Windows 10, version 1803. + let min_time_period = min_time_period(); + unsafe { + timeBeginPeriod(min_time_period); + std::thread::sleep(duration); + timeEndPeriod(min_time_period); + } + } +} + +#[inline] +pub(crate) fn sleep_accuracy() -> u32 { + if HIGH_RES_TIMER.with(|t| t.is_ok()) { + // high resolution timer is fast on average but has high maximums + // e.g. `average: 154.7µs, best: 2.6µs, worst: 729.5µs` + // + // 500-1000µs accuracy should mostly eliminate over-sleeps except under load. + 700_000 + } else { + min_time_period() * 1_000_000 } } -pub(crate) fn min_time_period() -> UINT { - static MIN_TIME_PERIOD: OnceLock<UINT> = OnceLock::new(); +/// Minimum time period for use with `timeBeginPeriod` & `timeEndPeriod`. +fn min_time_period() -> u32 { + static MIN_TIME_PERIOD: OnceLock<u32> = OnceLock::new(); *MIN_TIME_PERIOD.get_or_init(|| unsafe { let tc_size = mem::size_of::<TIMECAPS>() as u32; @@ -34,3 +52,73 @@ pub(crate) fn min_time_period() -> UINT { } }) } + +thread_local! { + static HIGH_RES_TIMER: Result<WaitableTimer, ()> = WaitableTimer::try_high_resolution(); +} + +#[inline] +fn high_res_sleep(duration: &Duration) -> Result<(), ()> { + HIGH_RES_TIMER.with(|timer| { + let timer = timer.as_ref().map_err(|_| ())?; + timer.set(duration)?; + timer.wait() + }) +} + +struct WaitableTimer { + handle: windows_sys::Win32::Foundation::HANDLE, +} + +impl WaitableTimer { + /// Create a high-resolution timer. Will fail before Windows 10, version 1803. + fn try_high_resolution() -> Result<Self, ()> { + let handle = unsafe { + CreateWaitableTimerExW( + null(), + null(), + CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, + TIMER_ALL_ACCESS, + ) + }; + match handle { + 0 => Err(()), + _ => Ok(Self { handle }), + } + } + + fn set(&self, duration: &Duration) -> Result<(), ()> { + // Convert the Duration to a format similar to FILETIME. 
+ // Negative values are relative times whereas positive values are absolute. + // Therefore we negate the relative duration. + let time = checked_dur2intervals(duration).ok_or(())?.neg(); + match unsafe { SetWaitableTimer(self.handle, &time, 0, None, null(), FALSE) } { + 0 => Err(()), + _ => Ok(()), + } + } + + fn wait(&self) -> Result<(), ()> { + match unsafe { WaitForSingleObject(self.handle, INFINITE) } { + windows_sys::Win32::Foundation::WAIT_FAILED => Err(()), + _ => Ok(()), + } + } +} + +impl Drop for WaitableTimer { + fn drop(&mut self) { + unsafe { CloseHandle(self.handle) }; + } +} + +fn checked_dur2intervals(dur: &Duration) -> Option<i64> { + const NANOS_PER_SEC: u64 = 1_000_000_000; + const INTERVALS_PER_SEC: u64 = NANOS_PER_SEC / 100; + + dur.as_secs() + .checked_mul(INTERVALS_PER_SEC)? + .checked_add(dur.subsec_nanos() as u64 / 100)? + .try_into() + .ok() +}