diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0478a06..44cd462 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,6 @@
 # Unreleased (v1.2.0)
+* Windows: Use a high resolution waitable timer when available (>= Windows 10, version 1803).
+* Windows: Replace _winapi_ with _windows-sys_ dependency.
 * Windows: Remove _once_cell_ dependency.
 
 # v1.1.1
diff --git a/Cargo.toml b/Cargo.toml
index f0bea2d..28b767d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,7 +10,9 @@ license = "Apache-2.0"
 readme="README.md"
 
 [target.'cfg(windows)'.dependencies]
-winapi = { version = "0.3", features = ["minwindef", "mmsystem", "timeapi"] }
+windows-sys = { version = "0.52", features = [
+    "Win32_Foundation", "Win32_Security", "Win32_System", "Win32_System_Threading", "Win32_Media"
+] }
 
 [dev-dependencies]
 approx = "0.5"
diff --git a/README.md b/README.md
index 67a8114..df0c019 100644
--- a/README.md
+++ b/README.md
@@ -47,8 +47,10 @@ let sleeper = SpinSleeper::default();
 ```
 
 ### Windows Accuracy
-Windows has particularly poor accuracy by default (~15ms), `spin_sleep` will automatically
-select the best accuracy on Windows generally achieving ~1ms native sleep accuracy *(Since 0.3.3)*.
+Windows (>= Windows 10, version 1803) will use a high resolution waitable timer, similar to `std::thread::sleep` in Rust >= 1.75.
+
+Earlier versions of Windows have particularly poor accuracy by default (~15ms). On these, `spin_sleep` will automatically
+select the best available accuracy, generally achieving ~1-2ms native sleep accuracy.
 
 ## Minimum supported rust compiler
 This crate is maintained with [latest stable rust](https://gist.github.com/alexheretic/d1e98d8433b602e57f5d0a9637927e0c).
diff --git a/experiments/README.md b/experiments/README.md
index e4c5338..94b978c 100644
--- a/experiments/README.md
+++ b/experiments/README.md
@@ -3,7 +3,7 @@ Experiments to measure latency all machine specific & non-deterministic but are
 good default settings for _spin_sleep_.
 
 ## native_sleep_accuracy
-Call OS native sleep for **1ns** and see how long it actually takes.
+Call OS native sleep for **1ns, 1µs & 1ms** and see how long it actually takes.
 
 ```sh
 cargo run --bin native_sleep_accuracy --release
@@ -11,12 +11,52 @@ cargo run --bin native_sleep_accuracy --release
 
 **Linux example output** *
 ```
-average: 53.04µs, best : 7.95µs, worst: 85.238µs
+==> sleep 1ns
+average: 54.0µs, best: 8.7µs, worst: 94.1µs
+==> sleep 1µs
+average: 55.1µs, best: 8.3µs, worst: 60.4µs
+==> sleep 1ms
+average: 1.055ms, best: 1.054ms, worst: 1.058ms
 ```
 
 **Windows example output** *
 ```
-average: 2.012432ms, best : 2.0069ms, worst: 2.1455ms
+==> sleep 1ns
+average: 2.0µs, best: 1.3µs, worst: 13.9µs
+==> sleep 1µs
+average: 446.7µs, best: 2.3µs, worst: 725.8µs
+==> sleep 1ms
+average: 1.775ms, best: 1.502ms, worst: 2.012ms
+```
+
+### Under high load
+Do the same measurement as above but while all cores are being stressed.
+```sh
+cargo run --bin native_sleep_accuracy --release -- load
+```
+
+**Linux example output** *
+Generally similar to no load, but more likely to produce occasional high latency.
+```
+Simulating 16 thread load
+==> sleep 1ns
+average: 53.8µs, best: 7.1µs, worst: 231.6µs
+==> sleep 1µs
+average: 58.0µs, best: 7.6µs, worst: 3.3ms
+==> sleep 1ms
+average: 1.054ms, best: 1.054ms, worst: 1.055ms
+```
+
+**Windows example output** *
+High latency is fairly common.
+```
+Simulating 16 thread load
+==> sleep 1ns
+average: 39.3µs, best: 1.8µs, worst: 36.8ms
+==> sleep 1µs
+average: 14.9ms, best: 2.1µs, worst: 46.9ms
+==> sleep 1ms
+average: 16.025ms, best: 2.004ms, worst: 30.071ms
 ```
 
 ## spin_strategy_latency
@@ -30,38 +70,38 @@ cargo run --bin spin_strategy_latency --release
 **Linux example output** *
 ```
 warming up...
-5ms    None          avg-spins: 191610   avg-actual: 5.000044ms
-5ms    SpinLoopHint  avg-spins: 176594   avg-actual: 5.000045ms
-5ms    YieldThread   avg-spins: 38366    avg-actual: 5.000105ms
-900µs  None          avg-spins: 34340    avg-actual: 900.05µs
-900µs  SpinLoopHint  avg-spins: 31633    avg-actual: 900.052µs
-900µs  YieldThread   avg-spins: 6843     avg-actual: 900.104µs
-5µs    None          avg-spins: 186      avg-actual: 5.04µs
-5µs    SpinLoopHint  avg-spins: 173      avg-actual: 5.048µs
-5µs    YieldThread   avg-spins: 38       avg-actual: 5.075µs
-100ns  None          avg-spins: 3        avg-actual: 135ns
+5ms    None          avg-spins: 231633   avg-actual: 5.000039ms
+5ms    SpinLoopHint  avg-spins: 168571   avg-actual: 5.000041ms
+5ms    YieldThread   avg-spins: 8431     avg-actual: 5.000323ms
+900µs  None          avg-spins: 41194    avg-actual: 900.039µs
+900µs  SpinLoopHint  avg-spins: 30094    avg-actual: 900.044µs
+900µs  YieldThread   avg-spins: 1527     avg-actual: 900.349µs
+5µs    None          avg-spins: 231      avg-actual: 5.033µs
+5µs    SpinLoopHint  avg-spins: 167      avg-actual: 5.063µs
+5µs    YieldThread   avg-spins: 9        avg-actual: 5.229µs
+100ns  None          avg-spins: 4        avg-actual: 129ns
 100ns  SpinLoopHint  avg-spins: 3        avg-actual: 132ns
-100ns  YieldThread   avg-spins: 1        avg-actual: 181ns
+100ns  YieldThread   avg-spins: 1        avg-actual: 625ns
 ```
 
 **Windows example output** *
 ```
 warming up...
-5ms    None          avg-spins: 158591   avg-actual: 5ms
-5ms    SpinLoopHint  avg-spins: 134568   avg-actual: 5ms
-5ms    YieldThread   avg-spins: 50380    avg-actual: 5.000039ms
-900µs  None          avg-spins: 28491    avg-actual: 900µs
-900µs  SpinLoopHint  avg-spins: 24128    avg-actual: 900.002µs
-900µs  YieldThread   avg-spins: 9070     avg-actual: 900.033µs
-5µs    None          avg-spins: 155      avg-actual: 5µs
-5µs    SpinLoopHint  avg-spins: 133      avg-actual: 5µs
-5µs    YieldThread   avg-spins: 49       avg-actual: 5.042µs
+5ms    None          avg-spins: 176820   avg-actual: 5ms
+5ms    SpinLoopHint  avg-spins: 164060   avg-actual: 5ms
+5ms    YieldThread   avg-spins: 31789    avg-actual: 5.000064ms
+900µs  None          avg-spins: 31791    avg-actual: 900µs
+900µs  SpinLoopHint  avg-spins: 29406    avg-actual: 900.021µs
+900µs  YieldThread   avg-spins: 5700     avg-actual: 900.063µs
+5µs    None          avg-spins: 139      avg-actual: 5µs
+5µs    SpinLoopHint  avg-spins: 160      avg-actual: 5µs
+5µs    YieldThread   avg-spins: 31       avg-actual: 5.09µs
 100ns  None          avg-spins: 0        avg-actual: 100ns
 100ns  SpinLoopHint  avg-spins: 0        avg-actual: 100ns
-100ns  YieldThread   avg-spins: 1        avg-actual: 102ns
+100ns  YieldThread   avg-spins: 0        avg-actual: 172ns
 ```
 
-## spin_strategy_latency under load
+### Under high load
 Do the same measurement as above but while all cores are being stressed.
 
 ```sh
@@ -72,36 +112,36 @@ cargo run --bin spin_strategy_latency --release -- load
 ```
 Simulating 16 thread load
 warming up...
-5ms    None          avg-spins: 159018   avg-actual: 5.000058ms
-5ms    SpinLoopHint  avg-spins: 122263   avg-actual: 5.000065ms
-5ms    YieldThread   avg-spins: 23265    avg-actual: 5.000327ms
-900µs  None          avg-spins: 27748    avg-actual: 938.427µs
-900µs  SpinLoopHint  avg-spins: 21727    avg-actual: 900.062µs
-900µs  YieldThread   avg-spins: 4054     avg-actual: 901.31µs
-5µs    None          avg-spins: 157      avg-actual: 5.055µs
-5µs    SpinLoopHint  avg-spins: 122      avg-actual: 5.057µs
-5µs    YieldThread   avg-spins: 23       avg-actual: 5.07µs
-100ns  None          avg-spins: 2        avg-actual: 147ns
-100ns  SpinLoopHint  avg-spins: 1        avg-actual: 135ns
-100ns  YieldThread   avg-spins: 1        avg-actual: 278ns
+5ms    None          avg-spins: 170998   avg-actual: 5.374337ms
+5ms    SpinLoopHint  avg-spins: 110830   avg-actual: 5.385263ms
+5ms    YieldThread   avg-spins: 6457     avg-actual: 5.000448ms
+900µs  None          avg-spins: 34035    avg-actual: 900.045µs
+900µs  SpinLoopHint  avg-spins: 21661    avg-actual: 900.051µs
+900µs  YieldThread   avg-spins: 1132     avg-actual: 900.54µs
+5µs    None          avg-spins: 186      avg-actual: 5.18µs
+5µs    SpinLoopHint  avg-spins: 117      avg-actual: 5.124µs
+5µs    YieldThread   avg-spins: 6        avg-actual: 5.621µs
+100ns  None          avg-spins: 3        avg-actual: 128ns
+100ns  SpinLoopHint  avg-spins: 2        avg-actual: 131ns
+100ns  YieldThread   avg-spins: 1        avg-actual: 898ns
 ```
 
 **Windows example output** *
 ```
 Simulating 16 thread load
 warming up...
-5ms    None          avg-spins: 105568   avg-actual: 5.838449ms
-5ms    SpinLoopHint  avg-spins: 79548    avg-actual: 5.608363ms
-5ms    YieldThread   avg-spins: 1        avg-actual: 17.526351ms
-900µs  None          avg-spins: 19461    avg-actual: 1.127537ms
-900µs  SpinLoopHint  avg-spins: 14578    avg-actual: 1.326708ms
-900µs  YieldThread   avg-spins: 1        avg-actual: 17.526448ms
-5µs    None          avg-spins: 108      avg-actual: 5µs
-5µs    SpinLoopHint  avg-spins: 79       avg-actual: 6.298µs
-5µs    YieldThread   avg-spins: 1        avg-actual: 11.417271ms
-100ns  None          avg-spins: 1        avg-actual: 101ns
-100ns  SpinLoopHint  avg-spins: 0        avg-actual: 102ns
-100ns  YieldThread   avg-spins: 0        avg-actual: 7.716038ms
-```
-
-\* _Measured 2022-02-18 with a AMD 5800X_.
+5ms    None          avg-spins: 140709   avg-actual: 5.604986ms
+5ms    SpinLoopHint  avg-spins: 108241   avg-actual: 5.81583ms
+5ms    YieldThread   avg-spins: 3        avg-actual: 32.039572ms
+900µs  None          avg-spins: 27701    avg-actual: 902.595µs
+900µs  SpinLoopHint  avg-spins: 20202    avg-actual: 1.210891ms
+900µs  YieldThread   avg-spins: 1        avg-actual: 11.297962ms
+5µs    None          avg-spins: 153      avg-actual: 5µs
+5µs    SpinLoopHint  avg-spins: 110      avg-actual: 5µs
+5µs    YieldThread   avg-spins: 1        avg-actual: 13.948654ms
+100ns  None          avg-spins: 0        avg-actual: 100ns
+100ns  SpinLoopHint  avg-spins: 0        avg-actual: 100ns
+100ns  YieldThread   avg-spins: 0        avg-actual: 2.882577ms
+```
+
+\* _Measured 2023-01-02 with an AMD 5800X_.
diff --git a/experiments/src/bin/native_sleep_accuracy.rs b/experiments/src/bin/native_sleep_accuracy.rs
index 7d9449e..cf51d25 100644
--- a/experiments/src/bin/native_sleep_accuracy.rs
+++ b/experiments/src/bin/native_sleep_accuracy.rs
@@ -1,4 +1,4 @@
-//! Call OS native sleep for **1ns** and see how long it actually takes.
+//! Call OS native sleep for **1ns, 1µs & 1ms** and see how long it actually takes.
 use std::time::{Duration, Instant};
 
 fn main() {
@@ -7,15 +7,31 @@ fn main() {
         std::process::exit(1);
     }
 
+    if std::env::args().nth(1).as_deref() == Some("load") {
+        let cpus = std::thread::available_parallelism().unwrap().into();
+        eprintln!("Simulating {cpus} thread load");
+        for _ in 0..cpus {
+            std::thread::spawn(|| {
+                use rand::Rng;
+                let mut rng = rand::thread_rng();
+                while rng.gen::<u64>() > 0 {}
+            });
+        }
+
+        std::thread::sleep(Duration::from_secs(1));
+    }
+
+    eprintln!("==> sleep 1ns");
+
     const ITS: u32 = 1000;
 
-    let mut best = Duration::from_secs(100);
-    let mut sum = Duration::from_secs(0);
-    let mut worst = Duration::from_secs(0);
+    let mut best = Duration::MAX;
+    let mut sum = Duration::ZERO;
+    let mut worst = Duration::ZERO;
 
     for _ in 0..ITS {
         let before = Instant::now();
-        spin_sleep::native_sleep(Duration::new(0, 1));
+        spin_sleep::native_sleep(Duration::from_nanos(1));
         let elapsed = before.elapsed();
         sum += elapsed;
         if elapsed < best {
@@ -27,7 +43,55 @@ fn main() {
     }
 
     println!(
-        "average: {:?}, best : {best:?}, worst: {worst:?}",
+        "average: {:.1?}, best: {best:.1?}, worst: {worst:.1?}",
         sum / ITS
     );
+
+    eprintln!("==> sleep 1µs");
+
+    let mut best = Duration::MAX;
+    let mut sum = Duration::ZERO;
+    let mut worst = Duration::ZERO;
+
+    for _ in 0..ITS {
+        let before = Instant::now();
+        spin_sleep::native_sleep(Duration::from_micros(1));
+        let elapsed = before.elapsed();
+        sum += elapsed;
+        if elapsed < best {
+            best = elapsed;
+        }
+        if elapsed > worst {
+            worst = elapsed;
+        }
+    }
+
+    println!(
+        "average: {:.1?}, best: {best:.1?}, worst: {worst:.1?}",
+        sum / ITS
+    );
+
+    eprintln!("==> sleep 1ms");
+
+    let mut best = Duration::MAX;
+    let mut sum = Duration::ZERO;
+    let mut worst = Duration::ZERO;
+
+    for _ in 0..50 {
+        let before = Instant::now();
+        spin_sleep::native_sleep(Duration::from_millis(1));
+        let elapsed = before.elapsed();
+        sum += elapsed;
+        if elapsed < best {
+            best = elapsed;
+        }
+        if elapsed > worst {
+            worst = elapsed;
+        }
+    }
+
+    println!(
+        "average: {:.3?}, best: {best:.3?}, worst: {worst:.3?}",
+        sum / 50
+    );
 }
diff --git a/src/lib.rs b/src/lib.rs
index bbd74d8..9439d2f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -81,8 +81,9 @@ const DEFAULT_NATIVE_SLEEP_ACCURACY: SubsecondNanoseconds = 125_000;
 /// **Does not spin.**
 ///
 /// Equivalent to [`std::thread::sleep`], with the following exceptions:
-/// * **Windows**: Automatically selects the best native sleep accuracy generally achieving ~1ms
-/// native sleep accuracy, instead of default ~16ms.
+/// * **Windows** (>= Windows 10, version 1803): Uses a high resolution waitable timer, similar to `std` in Rust >= 1.75.
+/// * **Windows** (< Windows 10, version 1803): Automatically selects the best native sleep accuracy
+///   generally achieving ~1ms native sleep accuracy, instead of default ~16ms.
 #[inline]
 pub fn native_sleep(duration: Duration) {
     #[cfg(windows)]
@@ -97,7 +98,7 @@ impl Default for SpinSleeper {
     #[inline]
     fn default() -> Self {
         #[cfg(windows)]
-        let accuracy = windows::min_time_period() * 1_000_000;
+        let accuracy = windows::sleep_accuracy();
         #[cfg(not(windows))]
         let accuracy = DEFAULT_NATIVE_SLEEP_ACCURACY;
 
diff --git a/src/windows.rs b/src/windows.rs
index be3a60e..cc61078 100644
--- a/src/windows.rs
+++ b/src/windows.rs
@@ -1,24 +1,42 @@
-use std::{mem, sync::OnceLock, time::Duration};
-use winapi::{
-    shared::minwindef::UINT,
-    um::{
-        mmsystem::{TIMECAPS, TIMERR_NOERROR},
-        timeapi::{timeBeginPeriod, timeEndPeriod, timeGetDevCaps},
+use std::{mem, ops::Neg, ptr::null, sync::OnceLock, time::Duration};
+use windows_sys::Win32::{
+    Foundation::{CloseHandle, FALSE},
+    Media::{timeBeginPeriod, timeEndPeriod, timeGetDevCaps, TIMECAPS, TIMERR_NOERROR},
+    System::Threading::{
+        CreateWaitableTimerExW, SetWaitableTimer, WaitForSingleObject,
+        CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, INFINITE, TIMER_ALL_ACCESS,
     },
 };
 
 #[inline]
 pub fn native_sleep(duration: Duration) {
-    let min_time_period = min_time_period();
-    unsafe {
-        timeBeginPeriod(min_time_period);
-        std::thread::sleep(duration);
-        timeEndPeriod(min_time_period);
+    if high_res_sleep(&duration).is_ok() {
+        return;
+    }
+    // Fallback for OS earlier than Windows 10, version 1803: sleep at the minimum period.
+    let period = min_time_period();
+    // SAFETY: both calls only take an integer; begin & end are paired with the same period.
+    unsafe { timeBeginPeriod(period) };
+    std::thread::sleep(duration);
+    unsafe { timeEndPeriod(period) };
+}
+
+/// Default native sleep accuracy, in nanoseconds, for a `SpinSleeper` created on this thread.
+#[inline]
+pub(crate) fn sleep_accuracy() -> u32 {
+    // The high resolution timer is fast on average but has high maximums,
+    // e.g. `average: 154.7µs, best: 2.6µs, worst: 729.5µs`, so 500-1000µs
+    // accuracy should mostly eliminate over-sleeps except under load.
+    const HIGH_RES_ACCURACY_NS: u32 = 700_000;
+    match HIGH_RES_TIMER.with(Result::is_ok) {
+        true => HIGH_RES_ACCURACY_NS,
+        false => min_time_period() * 1_000_000,
     }
 }
 
-pub(crate) fn min_time_period() -> UINT {
-    static MIN_TIME_PERIOD: OnceLock<UINT> = OnceLock::new();
+/// Minimum time period for use with `timeBeginPeriod` & `timeEndPeriod`.
+fn min_time_period() -> u32 {
+    static MIN_TIME_PERIOD: OnceLock<u32> = OnceLock::new();
 
     *MIN_TIME_PERIOD.get_or_init(|| unsafe {
         let tc_size = mem::size_of::<TIMECAPS>() as u32;
@@ -34,3 +52,73 @@ pub(crate) fn min_time_period() -> UINT {
         }
     })
 }
+
+thread_local! {
+    /// This thread's timer; `Err` if a high resolution timer could not be created.
+    static HIGH_RES_TIMER: Result<WaitableTimer, ()> = WaitableTimer::try_high_resolution();
+}
+
+/// Sleeps for `duration` on this thread's high resolution timer, if one is available.
+#[inline]
+fn high_res_sleep(duration: &Duration) -> Result<(), ()> {
+    // Arm the timer, then block until the wait returns.
+    let arm_and_wait = |t: &WaitableTimer| t.set(duration).and_then(|()| t.wait());
+    HIGH_RES_TIMER.with(|slot| slot.as_ref().map_err(|_| ()).and_then(arm_and_wait))
+}
+
+/// Owned handle to a Windows waitable timer, closed on drop.
+struct WaitableTimer {
+    handle: windows_sys::Win32::Foundation::HANDLE,
+}
+
+impl WaitableTimer {
+    /// Tries to create an unnamed high resolution waitable timer.
+    ///
+    /// Creation fails before Windows 10, version 1803, which is reported as `Err(())`.
+    fn try_high_resolution() -> Result<Self, ()> {
+        let flags = CREATE_WAITABLE_TIMER_HIGH_RESOLUTION;
+        // SAFETY: null attributes & name are permitted; no other pointers are passed.
+        let handle = unsafe { CreateWaitableTimerExW(null(), null(), flags, TIMER_ALL_ACCESS) };
+        // A zero handle signals failure; only wrap (and later close) a real handle.
+        (handle != 0).then(|| Self { handle }).ok_or(())
+    }
+
+    /// Arms the timer to fire once, `duration` from now.
+    ///
+    /// Fails if `duration` cannot be represented or the OS call reports an error.
+    fn set(&self, duration: &Duration) -> Result<(), ()> {
+        // Due times use 100ns units similar to FILETIME: positive values are absolute,
+        // negative values are relative, so the relative duration is negated.
+        let due_time = checked_dur2intervals(duration).ok_or(())?.neg();
+        // SAFETY: `self.handle` is a live timer handle & `due_time` outlives the call.
+        let armed = unsafe { SetWaitableTimer(self.handle, &due_time, 0, None, null(), FALSE) };
+        (armed != 0).then_some(()).ok_or(())
+    }
+
+    /// Blocks the calling thread, without timeout, until the wait on the timer returns.
+    fn wait(&self) -> Result<(), ()> {
+        // SAFETY: `self.handle` is a live timer handle owned by `self`.
+        let status = unsafe { WaitForSingleObject(self.handle, INFINITE) };
+        // Only an outright failure is an error; any other status counts as having slept.
+        let completed = status != windows_sys::Win32::Foundation::WAIT_FAILED;
+        completed.then_some(()).ok_or(())
+    }
+}
+
+impl Drop for WaitableTimer {
+    fn drop(&mut self) {
+        // SAFETY: the handle came from `try_high_resolution` & is closed only here.
+        unsafe { CloseHandle(self.handle) };
+    }
+}
+
+/// Converts `dur` to whole 100ns intervals (truncating), or `None` if it overflows `i64`.
+fn checked_dur2intervals(dur: &Duration) -> Option<i64> {
+    const NANOS_PER_INTERVAL: u32 = 100;
+    const INTERVALS_PER_SEC: u64 = 1_000_000_000 / NANOS_PER_INTERVAL as u64;
+
+    let from_secs = dur.as_secs().checked_mul(INTERVALS_PER_SEC)?;
+    let from_subsec = u64::from(dur.subsec_nanos() / NANOS_PER_INTERVAL);
+    // Both the u64 sum and the narrowing to i64 are checked.
+    i64::try_from(from_secs.checked_add(from_subsec)?).ok()
+}