diff --git a/bencher_compat/benches/bencher_example.rs b/bencher_compat/benches/bencher_example.rs
index c45d246eb..5339836bd 100644
--- a/bencher_compat/benches/bencher_example.rs
+++ b/bencher_compat/benches/bencher_example.rs
@@ -1,22 +1,22 @@
-#[macro_use]
-extern crate criterion_bencher_compat;
-
-use criterion_bencher_compat::Bencher;
-
-fn a(bench: &mut Bencher) {
-    bench.iter(|| {
-        (0..1000).fold(0, |x, y| x + y)
-    })
-}
-
-fn b(bench: &mut Bencher) {
-    const N: usize = 1024;
-    bench.iter(|| {
-        vec![0u8; N]
-    });
-
-    bench.bytes = N as u64;
-}
-
-benchmark_group!(benches, a, b);
+#[macro_use]
+extern crate criterion_bencher_compat;
+
+use criterion_bencher_compat::Bencher;
+
+fn a(bench: &mut Bencher) {
+    bench.iter(|| {
+        (0..1000).fold(0, |x, y| x + y)
+    })
+}
+
+fn b(bench: &mut Bencher) {
+    const N: usize = 1024;
+    bench.iter(|| {
+        vec![0u8; N]
+    });
+
+    bench.bytes = N as u64;
+}
+
+benchmark_group!(benches, a, b);
 benchmark_main!(benches);
\ No newline at end of file
diff --git a/benches/benchmarks/sampling_mode.rs b/benches/benchmarks/sampling_mode.rs
index af761273a..c7ac7bfa8 100644
--- a/benches/benchmarks/sampling_mode.rs
+++ b/benches/benchmarks/sampling_mode.rs
@@ -1,26 +1,26 @@
-use criterion::{criterion_group, Criterion, SamplingMode};
-use std::thread::sleep;
-use std::time::Duration;
-
-fn sampling_mode_tests(c: &mut Criterion) {
-    let mut group = c.benchmark_group("sampling_mode");
-
-    group.sampling_mode(SamplingMode::Auto);
-    group.bench_function("Auto", |bencher| {
-        bencher.iter(|| sleep(Duration::from_millis(0)))
-    });
-
-    group.sampling_mode(SamplingMode::Linear);
-    group.bench_function("Linear", |bencher| {
-        bencher.iter(|| sleep(Duration::from_millis(0)))
-    });
-
-    group.sampling_mode(SamplingMode::Flat);
-    group.bench_function("Flat", |bencher| {
-        bencher.iter(|| sleep(Duration::from_millis(10)))
-    });
-
-    group.finish();
-}
-
-criterion_group!(benches, sampling_mode_tests,);
+use criterion::{criterion_group, Criterion, SamplingMode};
+use std::thread::sleep;
+use std::time::Duration;
+
+fn sampling_mode_tests(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sampling_mode");
+
+    group.sampling_mode(SamplingMode::Auto);
+    group.bench_function("Auto", |bencher| {
+        bencher.iter(|| sleep(Duration::from_millis(0)))
+    });
+
+    group.sampling_mode(SamplingMode::Linear);
+    group.bench_function("Linear", |bencher| {
+        bencher.iter(|| sleep(Duration::from_millis(0)))
+    });
+
+    group.sampling_mode(SamplingMode::Flat);
+    group.bench_function("Flat", |bencher| {
+        bencher.iter(|| sleep(Duration::from_millis(10)))
+    });
+
+    group.finish();
+}
+
+criterion_group!(benches, sampling_mode_tests,);
diff --git a/macro/benches/test_macro_bench.rs b/macro/benches/test_macro_bench.rs
index 40a3ab198..7369fd27b 100644
--- a/macro/benches/test_macro_bench.rs
+++ b/macro/benches/test_macro_bench.rs
@@ -1,27 +1,27 @@
-#![feature(custom_test_frameworks)]
-#![test_runner(criterion::runner)]
-
-use criterion::{Criterion, black_box};
-use criterion_macro::criterion;
-
-fn fibonacci(n: u64) -> u64 {
-    match n {
-        0 | 1 => 1,
-        n => fibonacci(n - 1) + fibonacci(n - 2),
-    }
-}
-
-fn custom_criterion() -> Criterion {
-    Criterion::default()
-        .sample_size(50)
-}
-
-#[criterion]
-fn bench_simple(c: &mut Criterion) {
-    c.bench_function("Fibonacci-Simple", |b| b.iter(|| fibonacci(black_box(10))));
-}
-
-#[criterion(custom_criterion())]
-fn bench_custom(c: &mut Criterion) {
-    c.bench_function("Fibonacci-Custom", |b| b.iter(|| fibonacci(black_box(20))));
+#![feature(custom_test_frameworks)]
+#![test_runner(criterion::runner)]
+
+use criterion::{Criterion, black_box};
+use criterion_macro::criterion;
+
+fn fibonacci(n: u64) -> u64 {
+    match n {
+        0 | 1 => 1,
+        n => fibonacci(n - 1) + fibonacci(n - 2),
+    }
+}
+
+fn custom_criterion() -> Criterion {
+    Criterion::default()
+        .sample_size(50)
+}
+
+#[criterion]
+fn bench_simple(c: &mut Criterion) {
+    c.bench_function("Fibonacci-Simple", |b| b.iter(|| fibonacci(black_box(10))));
+}
+
+#[criterion(custom_criterion())]
+fn bench_custom(c: &mut Criterion) {
+    c.bench_function("Fibonacci-Custom", |b| b.iter(|| fibonacci(black_box(20))));
 }
\ No newline at end of file
diff --git a/macro/src/lib.rs b/macro/src/lib.rs
index 6297a172e..360919362 100644
--- a/macro/src/lib.rs
+++ b/macro/src/lib.rs
@@ -1,56 +1,56 @@
-extern crate proc_macro;
-use proc_macro::TokenStream;
-use proc_macro2::{Ident, TokenTree};
-use quote::quote_spanned;
-
-#[proc_macro_attribute]
-pub fn criterion(attr: TokenStream, item: TokenStream) -> TokenStream {
-    let attr = proc_macro2::TokenStream::from(attr);
-    let item = proc_macro2::TokenStream::from(item);
-
-    let span = proc_macro2::Span::call_site();
-
-    let init = if stream_length(attr.clone()) != 0 {
-        attr
-    }
-    else {
-        quote_spanned!(span=> criterion::Criterion::default())
-    };
-
-    let function_name = find_name(item.clone());
-    let wrapped_name = Ident::new(&format!("criterion_wrapped_{}", function_name.to_string()), span);
-
-    let output = quote_spanned!(span=>
-        #[test_case]
-        pub fn #wrapped_name() {
-            #item
-
-            let mut c = #init.configure_from_args();
-            #function_name(&mut c);
-        }
-    );
-
-    output.into()
-}
-
-fn stream_length(stream: proc_macro2::TokenStream) -> usize {
-    stream.into_iter().count()
-}
-
-fn find_name(stream: proc_macro2::TokenStream) -> Ident {
-    let mut iter = stream.into_iter();
-    while let Some(tok) = iter.next() {
-        if let TokenTree::Ident(ident) = tok {
-            if ident == "fn" {
-                break;
-            }
-        }
-    }
-    
-    if let Some(TokenTree::Ident(name)) = iter.next() {
-        name
-    }
-    else {
-        panic!("Unable to find function name")
-    }
+extern crate proc_macro;
+use proc_macro::TokenStream;
+use proc_macro2::{Ident, TokenTree};
+use quote::quote_spanned;
+
+#[proc_macro_attribute]
+pub fn criterion(attr: TokenStream, item: TokenStream) -> TokenStream {
+    let attr = proc_macro2::TokenStream::from(attr);
+    let item = proc_macro2::TokenStream::from(item);
+
+    let span = proc_macro2::Span::call_site();
+
+    let init = if stream_length(attr.clone()) != 0 {
+        attr
+    }
+    else {
+        quote_spanned!(span=> criterion::Criterion::default())
+    };
+
+    let function_name = find_name(item.clone());
+    let wrapped_name = Ident::new(&format!("criterion_wrapped_{}", function_name.to_string()), span);
+
+    let output = quote_spanned!(span=>
+        #[test_case]
+        pub fn #wrapped_name() {
+            #item
+
+            let mut c = #init.configure_from_args();
+            #function_name(&mut c);
+        }
+    );
+
+    output.into()
+}
+
+fn stream_length(stream: proc_macro2::TokenStream) -> usize {
+    stream.into_iter().count()
+}
+
+fn find_name(stream: proc_macro2::TokenStream) -> Ident {
+    let mut iter = stream.into_iter();
+    while let Some(tok) = iter.next() {
+        if let TokenTree::Ident(ident) = tok {
+            if ident == "fn" {
+                break;
+            }
+        }
+    }
+    
+    if let Some(TokenTree::Ident(name)) = iter.next() {
+        name
+    }
+    else {
+        panic!("Unable to find function name")
+    }
 }
\ No newline at end of file
diff --git a/src/async_executor.rs b/src/async_executor.rs
index d51626448..58877d54b 100644
--- a/src/async_executor.rs
+++ b/src/async_executor.rs
@@ -1,66 +1,66 @@
-//! This module defines a trait that can be used to plug in different Futures executors into
-//! Criterion.rs' async benchmarking support.
-//!
-//! Implementations are provided for:
-//! * Tokio (implemented directly for `tokio::Runtime`)
-//! * Async-std
-//! * Smol
-//! * The Futures crate
-//!
-//! Please note that async benchmarks will have a small amount of measurement overhead relative
-//! to synchronous benchmarks. It is recommended to use synchronous benchmarks where possible, to
-//! improve measurement accuracy.
-
-use std::future::Future;
-
-/// Plugin trait used to allow benchmarking on multiple different async runtimes.
-///
-/// Smol, Tokio and Async-std are supported out of the box, as is the current-thread runner from the
-/// Futures crate; it is recommended to use whichever runtime you use in production.
-pub trait AsyncExecutor {
-    /// Spawn the given future onto this runtime and block until it's complete, returning the result.
-    fn block_on<T>(&self, future: impl Future<Output = T>) -> T;
-}
-
-/// Runs futures on the 'futures' crate's built-in current-thread executor
-#[cfg(feature = "async_futures")]
-pub struct FuturesExecutor;
-#[cfg(feature = "async_futures")]
-impl AsyncExecutor for FuturesExecutor {
-    fn block_on<T>(&self, future: impl Future<Output = T>) -> T {
-        futures::executor::block_on(future)
-    }
-}
-
-/// Runs futures on the 'smol' crate's global executor
-#[cfg(feature = "async_smol")]
-pub struct SmolExecutor;
-#[cfg(feature = "async_smol")]
-impl AsyncExecutor for SmolExecutor {
-    fn block_on<T>(&self, future: impl Future<Output = T>) -> T {
-        smol::block_on(future)
-    }
-}
-
-#[cfg(feature = "async_tokio")]
-impl AsyncExecutor for tokio::runtime::Runtime {
-    fn block_on<T>(&self, future: impl Future<Output = T>) -> T {
-        self.block_on(future)
-    }
-}
-#[cfg(feature = "async_tokio")]
-impl AsyncExecutor for &tokio::runtime::Runtime {
-    fn block_on<T>(&self, future: impl Future<Output = T>) -> T {
-        (*self).block_on(future)
-    }
-}
-
-/// Runs futures on the 'async-std' crate's global executor
-#[cfg(feature = "async_std")]
-pub struct AsyncStdExecutor;
-#[cfg(feature = "async_std")]
-impl AsyncExecutor for AsyncStdExecutor {
-    fn block_on<T>(&self, future: impl Future<Output = T>) -> T {
-        async_std::task::block_on(future)
-    }
-}
+//! This module defines a trait that can be used to plug in different Futures executors into
+//! Criterion.rs' async benchmarking support.
+//!
+//! Implementations are provided for:
+//! * Tokio (implemented directly for `tokio::Runtime`)
+//! * Async-std
+//! * Smol
+//! * The Futures crate
+//!
+//! Please note that async benchmarks will have a small amount of measurement overhead relative
+//! to synchronous benchmarks. It is recommended to use synchronous benchmarks where possible, to
+//! improve measurement accuracy.
+
+use std::future::Future;
+
+/// Plugin trait used to allow benchmarking on multiple different async runtimes.
+///
+/// Smol, Tokio and Async-std are supported out of the box, as is the current-thread runner from the
+/// Futures crate; it is recommended to use whichever runtime you use in production.
+pub trait AsyncExecutor {
+    /// Spawn the given future onto this runtime and block until it's complete, returning the result.
+    fn block_on<T>(&self, future: impl Future<Output = T>) -> T;
+}
+
+/// Runs futures on the 'futures' crate's built-in current-thread executor
+#[cfg(feature = "async_futures")]
+pub struct FuturesExecutor;
+#[cfg(feature = "async_futures")]
+impl AsyncExecutor for FuturesExecutor {
+    fn block_on<T>(&self, future: impl Future<Output = T>) -> T {
+        futures::executor::block_on(future)
+    }
+}
+
+/// Runs futures on the 'smol' crate's global executor
+#[cfg(feature = "async_smol")]
+pub struct SmolExecutor;
+#[cfg(feature = "async_smol")]
+impl AsyncExecutor for SmolExecutor {
+    fn block_on<T>(&self, future: impl Future<Output = T>) -> T {
+        smol::block_on(future)
+    }
+}
+
+#[cfg(feature = "async_tokio")]
+impl AsyncExecutor for tokio::runtime::Runtime {
+    fn block_on<T>(&self, future: impl Future<Output = T>) -> T {
+        self.block_on(future)
+    }
+}
+#[cfg(feature = "async_tokio")]
+impl AsyncExecutor for &tokio::runtime::Runtime {
+    fn block_on<T>(&self, future: impl Future<Output = T>) -> T {
+        (*self).block_on(future)
+    }
+}
+
+/// Runs futures on the 'async-std' crate's global executor
+#[cfg(feature = "async_std")]
+pub struct AsyncStdExecutor;
+#[cfg(feature = "async_std")]
+impl AsyncExecutor for AsyncStdExecutor {
+    fn block_on<T>(&self, future: impl Future<Output = T>) -> T {
+        async_std::task::block_on(future)
+    }
+}
diff --git a/src/bencher.rs b/src/bencher.rs
index b2bd971a8..a508fd3e4 100644
--- a/src/bencher.rs
+++ b/src/bencher.rs
@@ -1,763 +1,763 @@
-use std::time::Duration;
-use std::time::Instant;
-
-use crate::black_box;
-use crate::measurement::{Measurement, WallTime};
-use crate::BatchSize;
-
-#[cfg(feature = "async")]
-use std::future::Future;
-
-#[cfg(feature = "async")]
-use crate::async_executor::AsyncExecutor;
-
-// ================================== MAINTENANCE NOTE =============================================
-// Any changes made to either Bencher or AsyncBencher will have to be replicated to the other!
-// ================================== MAINTENANCE NOTE =============================================
-
-/// Timer struct used to iterate a benchmarked function and measure the runtime.
-///
-/// This struct provides different timing loops as methods. Each timing loop provides a different
-/// way to time a routine and each has advantages and disadvantages.
-///
-/// * If you want to do the iteration and measurement yourself (eg. passing the iteration count
-///   to a separate process), use `iter_custom`.
-/// * If your routine requires no per-iteration setup and returns a value with an expensive `drop`
-///   method, use `iter_with_large_drop`.
-/// * If your routine requires some per-iteration setup that shouldn't be timed, use `iter_batched`
-///   or `iter_batched_ref`. See [`BatchSize`](enum.BatchSize.html) for a discussion of batch sizes.
-///   If the setup value implements `Drop` and you don't want to include the `drop` time in the
-///   measurement, use `iter_batched_ref`, otherwise use `iter_batched`. These methods are also
-///   suitable for benchmarking routines which return a value with an expensive `drop` method,
-///   but are more complex than `iter_with_large_drop`.
-/// * Otherwise, use `iter`.
-pub struct Bencher<'a, M: Measurement = WallTime> {
-    pub(crate) iterated: bool,         // Have we iterated this benchmark?
-    pub(crate) iters: u64,             // Number of times to iterate this benchmark
-    pub(crate) value: M::Value,        // The measured value
-    pub(crate) measurement: &'a M,     // Reference to the measurement object
-    pub(crate) elapsed_time: Duration, // How much time did it take to perform the iteration? Used for the warmup period.
-}
-impl<'a, M: Measurement> Bencher<'a, M> {
-    /// Times a `routine` by executing it many times and timing the total elapsed time.
-    ///
-    /// Prefer this timing loop when `routine` returns a value that doesn't have a destructor.
-    ///
-    /// # Timing model
-    ///
-    /// Note that the `Bencher` also times the time required to destroy the output of `routine()`.
-    /// Therefore prefer this timing loop when the runtime of `mem::drop(O)` is negligible compared
-    /// to the runtime of the `routine`.
-    ///
-    /// ```text
-    /// elapsed = Instant::now + iters * (routine + mem::drop(O) + Range::next)
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    ///
-    /// // The function to benchmark
-    /// fn foo() {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("iter", move |b| {
-    ///         b.iter(|| foo())
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter<O, R>(&mut self, mut routine: R)
-    where
-        R: FnMut() -> O,
-    {
-        self.iterated = true;
-        let time_start = Instant::now();
-        let start = self.measurement.start();
-        for _ in 0..self.iters {
-            black_box(routine());
-        }
-        self.value = self.measurement.end(start);
-        self.elapsed_time = time_start.elapsed();
-    }
-
-    /// Times a `routine` by executing it many times and relying on `routine` to measure its own execution time.
-    ///
-    /// Prefer this timing loop in cases where `routine` has to do its own measurements to
-    /// get accurate timing information (for example in multi-threaded scenarios where you spawn
-    /// and coordinate with multiple threads).
-    ///
-    /// # Timing model
-    /// Custom, the timing model is whatever is returned as the Duration from `routine`.
-    ///
-    /// # Example
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    /// use criterion::*;
-    /// use criterion::black_box;
-    /// use std::time::Instant;
-    ///
-    /// fn foo() {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("iter", move |b| {
-    ///         b.iter_custom(|iters| {
-    ///             let start = Instant::now();
-    ///             for _i in 0..iters {
-    ///                 black_box(foo());
-    ///             }
-    ///             start.elapsed()
-    ///         })
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_custom<R>(&mut self, mut routine: R)
-    where
-        R: FnMut(u64) -> M::Value,
-    {
-        self.iterated = true;
-        let time_start = Instant::now();
-        self.value = routine(self.iters);
-        self.elapsed_time = time_start.elapsed();
-    }
-
-    #[doc(hidden)]
-    pub fn iter_with_setup<I, O, S, R>(&mut self, setup: S, routine: R)
-    where
-        S: FnMut() -> I,
-        R: FnMut(I) -> O,
-    {
-        self.iter_batched(setup, routine, BatchSize::PerIteration);
-    }
-
-    /// Times a `routine` by collecting its output on each iteration. This avoids timing the
-    /// destructor of the value returned by `routine`.
-    ///
-    /// WARNING: This requires `O(iters * mem::size_of::<O>())` of memory, and `iters` is not under the
-    /// control of the caller. If this causes out-of-memory errors, use `iter_batched` instead.
-    ///
-    /// # Timing model
-    ///
-    /// ``` text
-    /// elapsed = Instant::now + iters * (routine) + Iterator::collect::<Vec<_>>
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    ///
-    /// fn create_vector() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("with_drop", move |b| {
-    ///         // This will avoid timing the Vec::drop.
-    ///         b.iter_with_large_drop(|| create_vector())
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    pub fn iter_with_large_drop<O, R>(&mut self, mut routine: R)
-    where
-        R: FnMut() -> O,
-    {
-        self.iter_batched(|| (), |_| routine(), BatchSize::SmallInput);
-    }
-
-    /// Times a `routine` that requires some input by generating a batch of input, then timing the
-    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
-    /// details on choosing the batch size. Use this when the routine must consume its input.
-    ///
-    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
-    /// data on each iteration.
-    ///
-    /// # Timing model
-    ///
-    /// ```text
-    /// elapsed = (Instant::now * num_batches) + (iters * (routine + O::drop)) + Vec::extend
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    ///
-    /// fn create_scrambled_data() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// // The sorting algorithm to test
-    /// fn sort(data: &mut [u64]) {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     let data = create_scrambled_data();
-    ///
-    ///     c.bench_function("with_setup", move |b| {
-    ///         // This will avoid timing the clone call.
-    ///         b.iter_batched(|| data.clone(), |mut data| sort(&mut data), BatchSize::SmallInput)
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_batched<I, O, S, R>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
-    where
-        S: FnMut() -> I,
-        R: FnMut(I) -> O,
-    {
-        self.iterated = true;
-        let batch_size = size.iters_per_batch(self.iters);
-        assert!(batch_size != 0, "Batch size must not be zero.");
-        let time_start = Instant::now();
-        self.value = self.measurement.zero();
-
-        if batch_size == 1 {
-            for _ in 0..self.iters {
-                let input = black_box(setup());
-
-                let start = self.measurement.start();
-                let output = routine(input);
-                let end = self.measurement.end(start);
-                self.value = self.measurement.add(&self.value, &end);
-
-                drop(black_box(output));
-            }
-        } else {
-            let mut iteration_counter = 0;
-
-            while iteration_counter < self.iters {
-                let batch_size = ::std::cmp::min(batch_size, self.iters - iteration_counter);
-
-                let inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
-                let mut outputs = Vec::with_capacity(batch_size as usize);
-
-                let start = self.measurement.start();
-                outputs.extend(inputs.into_iter().map(&mut routine));
-                let end = self.measurement.end(start);
-                self.value = self.measurement.add(&self.value, &end);
-
-                black_box(outputs);
-
-                iteration_counter += batch_size;
-            }
-        }
-
-        self.elapsed_time = time_start.elapsed();
-    }
-
-    /// Times a `routine` that requires some input by generating a batch of input, then timing the
-    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
-    /// details on choosing the batch size. Use this when the routine should accept the input by
-    /// mutable reference.
-    ///
-    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
-    /// data on each iteration.
-    ///
-    /// # Timing model
-    ///
-    /// ```text
-    /// elapsed = (Instant::now * num_batches) + (iters * routine) + Vec::extend
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    ///
-    /// fn create_scrambled_data() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// // The sorting algorithm to test
-    /// fn sort(data: &mut [u64]) {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     let data = create_scrambled_data();
-    ///
-    ///     c.bench_function("with_setup", move |b| {
-    ///         // This will avoid timing the clone call.
-    ///         b.iter_batched(|| data.clone(), |mut data| sort(&mut data), BatchSize::SmallInput)
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_batched_ref<I, O, S, R>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
-    where
-        S: FnMut() -> I,
-        R: FnMut(&mut I) -> O,
-    {
-        self.iterated = true;
-        let batch_size = size.iters_per_batch(self.iters);
-        assert!(batch_size != 0, "Batch size must not be zero.");
-        let time_start = Instant::now();
-        self.value = self.measurement.zero();
-
-        if batch_size == 1 {
-            for _ in 0..self.iters {
-                let mut input = black_box(setup());
-
-                let start = self.measurement.start();
-                let output = routine(&mut input);
-                let end = self.measurement.end(start);
-                self.value = self.measurement.add(&self.value, &end);
-
-                drop(black_box(output));
-                drop(black_box(input));
-            }
-        } else {
-            let mut iteration_counter = 0;
-
-            while iteration_counter < self.iters {
-                let batch_size = ::std::cmp::min(batch_size, self.iters - iteration_counter);
-
-                let mut inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
-                let mut outputs = Vec::with_capacity(batch_size as usize);
-
-                let start = self.measurement.start();
-                outputs.extend(inputs.iter_mut().map(&mut routine));
-                let end = self.measurement.end(start);
-                self.value = self.measurement.add(&self.value, &end);
-
-                black_box(outputs);
-
-                iteration_counter += batch_size;
-            }
-        }
-        self.elapsed_time = time_start.elapsed();
-    }
-
-    // Benchmarks must actually call one of the iter methods. This causes benchmarks to fail loudly
-    // if they don't.
-    pub(crate) fn assert_iterated(&mut self) {
-        assert!(
-            self.iterated,
-            "Benchmark function must call Bencher::iter or related method."
-        );
-        self.iterated = false;
-    }
-
-    /// Convert this bencher into an AsyncBencher, which enables async/await support.
-    #[cfg(feature = "async")]
-    pub fn to_async<'b, A: AsyncExecutor>(&'b mut self, runner: A) -> AsyncBencher<'a, 'b, A, M> {
-        AsyncBencher { b: self, runner }
-    }
-}
-
-/// Async/await variant of the Bencher struct.
-#[cfg(feature = "async")]
-pub struct AsyncBencher<'a, 'b, A: AsyncExecutor, M: Measurement = WallTime> {
-    b: &'b mut Bencher<'a, M>,
-    runner: A,
-}
-#[cfg(feature = "async")]
-impl<'a, 'b, A: AsyncExecutor, M: Measurement> AsyncBencher<'a, 'b, A, M> {
-    /// Times a `routine` by executing it many times and timing the total elapsed time.
-    ///
-    /// Prefer this timing loop when `routine` returns a value that doesn't have a destructor.
-    ///
-    /// # Timing model
-    ///
-    /// Note that the `AsyncBencher` also times the time required to destroy the output of `routine()`.
-    /// Therefore prefer this timing loop when the runtime of `mem::drop(O)` is negligible compared
-    /// to the runtime of the `routine`.
-    ///
-    /// ```text
-    /// elapsed = Instant::now + iters * (routine + mem::drop(O) + Range::next)
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    /// use criterion::async_executor::FuturesExecutor;
-    ///
-    /// // The function to benchmark
-    /// async fn foo() {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("iter", move |b| {
-    ///         b.to_async(FuturesExecutor).iter(|| async { foo().await } )
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter<O, R, F>(&mut self, mut routine: R)
-    where
-        R: FnMut() -> F,
-        F: Future<Output = O>,
-    {
-        let AsyncBencher { b, runner } = self;
-        runner.block_on(async {
-            b.iterated = true;
-            let time_start = Instant::now();
-            let start = b.measurement.start();
-            for _ in 0..b.iters {
-                black_box(routine().await);
-            }
-            b.value = b.measurement.end(start);
-            b.elapsed_time = time_start.elapsed();
-        });
-    }
-
-    /// Times a `routine` by executing it many times and relying on `routine` to measure its own execution time.
-    ///
-    /// Prefer this timing loop in cases where `routine` has to do its own measurements to
-    /// get accurate timing information (for example in multi-threaded scenarios where you spawn
-    /// and coordinate with multiple threads).
-    ///
-    /// # Timing model
-    /// Custom, the timing model is whatever is returned as the Duration from `routine`.
-    ///
-    /// # Example
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    /// use criterion::*;
-    /// use criterion::black_box;
-    /// use criterion::async_executor::FuturesExecutor;
-    /// use std::time::Instant;
-    ///
-    /// async fn foo() {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("iter", move |b| {
-    ///         b.to_async(FuturesExecutor).iter_custom(|iters| {
-    ///             async move {
-    ///                 let start = Instant::now();
-    ///                 for _i in 0..iters {
-    ///                     black_box(foo().await);
-    ///                 }
-    ///                 start.elapsed()
-    ///             }
-    ///         })
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_custom<R, F>(&mut self, mut routine: R)
-    where
-        R: FnMut(u64) -> F,
-        F: Future<Output = M::Value>,
-    {
-        let AsyncBencher { b, runner } = self;
-        runner.block_on(async {
-            b.iterated = true;
-            let time_start = Instant::now();
-            b.value = routine(b.iters).await;
-            b.elapsed_time = time_start.elapsed();
-        })
-    }
-
-    #[doc(hidden)]
-    pub fn iter_with_setup<I, O, S, R, F>(&mut self, setup: S, routine: R)
-    where
-        S: FnMut() -> I,
-        R: FnMut(I) -> F,
-        F: Future<Output = O>,
-    {
-        self.iter_batched(setup, routine, BatchSize::PerIteration);
-    }
-
-    /// Times a `routine` by collecting its output on each iteration. This avoids timing the
-    /// destructor of the value returned by `routine`.
-    ///
-    /// WARNING: This requires `O(iters * mem::size_of::<O>())` of memory, and `iters` is not under the
-    /// control of the caller. If this causes out-of-memory errors, use `iter_batched` instead.
-    ///
-    /// # Timing model
-    ///
-    /// ``` text
-    /// elapsed = Instant::now + iters * (routine) + Iterator::collect::<Vec<_>>
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    /// use criterion::async_executor::FuturesExecutor;
-    ///
-    /// async fn create_vector() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("with_drop", move |b| {
-    ///         // This will avoid timing the Vec::drop.
-    ///         b.to_async(FuturesExecutor).iter_with_large_drop(|| async { create_vector().await })
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    pub fn iter_with_large_drop<O, R, F>(&mut self, mut routine: R)
-    where
-        R: FnMut() -> F,
-        F: Future<Output = O>,
-    {
-        self.iter_batched(|| (), |_| routine(), BatchSize::SmallInput);
-    }
-
-    #[doc(hidden)]
-    pub fn iter_with_large_setup<I, O, S, R, F>(&mut self, setup: S, routine: R)
-    where
-        S: FnMut() -> I,
-        R: FnMut(I) -> F,
-        F: Future<Output = O>,
-    {
-        self.iter_batched(setup, routine, BatchSize::NumBatches(1));
-    }
-
-    /// Times a `routine` that requires some input by generating a batch of input, then timing the
-    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
-    /// details on choosing the batch size. Use this when the routine must consume its input.
-    ///
-    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
-    /// data on each iteration.
-    ///
-    /// # Timing model
-    ///
-    /// ```text
-    /// elapsed = (Instant::now * num_batches) + (iters * (routine + O::drop)) + Vec::extend
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    /// use criterion::async_executor::FuturesExecutor;
-    ///
-    /// fn create_scrambled_data() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// // The sorting algorithm to test
-    /// async fn sort(data: &mut [u64]) {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     let data = create_scrambled_data();
-    ///
-    ///     c.bench_function("with_setup", move |b| {
-    ///         // This will avoid timing the clone call.
-    ///         b.iter_batched(|| data.clone(), |mut data| async move { sort(&mut data).await }, BatchSize::SmallInput)
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_batched<I, O, S, R, F>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
-    where
-        S: FnMut() -> I,
-        R: FnMut(I) -> F,
-        F: Future<Output = O>,
-    {
-        let AsyncBencher { b, runner } = self;
-        runner.block_on(async {
-            b.iterated = true;
-            let batch_size = size.iters_per_batch(b.iters);
-            assert!(batch_size != 0, "Batch size must not be zero.");
-            let time_start = Instant::now();
-            b.value = b.measurement.zero();
-
-            if batch_size == 1 {
-                for _ in 0..b.iters {
-                    let input = black_box(setup());
-
-                    let start = b.measurement.start();
-                    let output = routine(input).await;
-                    let end = b.measurement.end(start);
-                    b.value = b.measurement.add(&b.value, &end);
-
-                    drop(black_box(output));
-                }
-            } else {
-                let mut iteration_counter = 0;
-
-                while iteration_counter < b.iters {
-                    let batch_size = ::std::cmp::min(batch_size, b.iters - iteration_counter);
-
-                    let inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
-                    let mut outputs = Vec::with_capacity(batch_size as usize);
-
-                    let start = b.measurement.start();
-                    // Can't use .extend here like the sync version does
-                    for input in inputs {
-                        outputs.push(routine(input).await);
-                    }
-                    let end = b.measurement.end(start);
-                    b.value = b.measurement.add(&b.value, &end);
-
-                    black_box(outputs);
-
-                    iteration_counter += batch_size;
-                }
-            }
-
-            b.elapsed_time = time_start.elapsed();
-        })
-    }
-
-    /// Times a `routine` that requires some input by generating a batch of input, then timing the
-    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
-    /// details on choosing the batch size. Use this when the routine should accept the input by
-    /// mutable reference.
-    ///
-    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
-    /// data on each iteration.
-    ///
-    /// # Timing model
-    ///
-    /// ```text
-    /// elapsed = (Instant::now * num_batches) + (iters * routine) + Vec::extend
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    /// use criterion::async_executor::FuturesExecutor;
-    ///
-    /// fn create_scrambled_data() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// // The sorting algorithm to test
-    /// async fn sort(data: &mut [u64]) {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     let data = create_scrambled_data();
-    ///
-    ///     c.bench_function("with_setup", move |b| {
-    ///         // This will avoid timing the clone call.
-    ///         b.iter_batched(|| data.clone(), |mut data| async move { sort(&mut data).await }, BatchSize::SmallInput)
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_batched_ref<I, O, S, R, F>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
-    where
-        S: FnMut() -> I,
-        R: FnMut(&mut I) -> F,
-        F: Future<Output = O>,
-    {
-        let AsyncBencher { b, runner } = self;
-        runner.block_on(async {
-            b.iterated = true;
-            let batch_size = size.iters_per_batch(b.iters);
-            assert!(batch_size != 0, "Batch size must not be zero.");
-            let time_start = Instant::now();
-            b.value = b.measurement.zero();
-
-            if batch_size == 1 {
-                for _ in 0..b.iters {
-                    let mut input = black_box(setup());
-
-                    let start = b.measurement.start();
-                    let output = routine(&mut input).await;
-                    let end = b.measurement.end(start);
-                    b.value = b.measurement.add(&b.value, &end);
-
-                    drop(black_box(output));
-                    drop(black_box(input));
-                }
-            } else {
-                let mut iteration_counter = 0;
-
-                while iteration_counter < b.iters {
-                    let batch_size = ::std::cmp::min(batch_size, b.iters - iteration_counter);
-
-                    let inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
-                    let mut outputs = Vec::with_capacity(batch_size as usize);
-
-                    let start = b.measurement.start();
-                    // Can't use .extend here like the sync version does
-                    for mut input in inputs {
-                        outputs.push(routine(&mut input).await);
-                    }
-                    let end = b.measurement.end(start);
-                    b.value = b.measurement.add(&b.value, &end);
-
-                    black_box(outputs);
-
-                    iteration_counter += batch_size;
-                }
-            }
-            b.elapsed_time = time_start.elapsed();
-        });
-    }
-}
+use std::time::Duration;
+use std::time::Instant;
+
+use crate::black_box;
+use crate::measurement::{Measurement, WallTime};
+use crate::BatchSize;
+
+#[cfg(feature = "async")]
+use std::future::Future;
+
+#[cfg(feature = "async")]
+use crate::async_executor::AsyncExecutor;
+
+// ================================== MAINTENANCE NOTE =============================================
+// Any changes made to either Bencher or AsyncBencher will have to be replicated to the other!
+// ================================== MAINTENANCE NOTE =============================================
+
+/// Timer struct used to iterate a benchmarked function and measure the runtime.
+///
+/// This struct provides different timing loops as methods. Each timing loop provides a different
+/// way to time a routine and each has advantages and disadvantages.
+///
+/// * If you want to do the iteration and measurement yourself (e.g. passing the iteration count
+///   to a separate process), use `iter_custom`.
+/// * If your routine requires no per-iteration setup and returns a value with an expensive `drop`
+///   method, use `iter_with_large_drop`.
+/// * If your routine requires some per-iteration setup that shouldn't be timed, use `iter_batched`
+///   or `iter_batched_ref`. See [`BatchSize`](enum.BatchSize.html) for a discussion of batch sizes.
+///   If the setup value implements `Drop` and you don't want to include the `drop` time in the
+///   measurement, use `iter_batched_ref`, otherwise use `iter_batched`. These methods are also
+///   suitable for benchmarking routines which return a value with an expensive `drop` method,
+///   but are more complex than `iter_with_large_drop`.
+/// * Otherwise, use `iter`.
+pub struct Bencher<'a, M: Measurement = WallTime> {
+    pub(crate) iterated: bool,         // Have we iterated this benchmark?
+    pub(crate) iters: u64,             // Number of times to iterate this benchmark
+    pub(crate) value: M::Value,        // The measured value
+    pub(crate) measurement: &'a M,     // Reference to the measurement object
+    pub(crate) elapsed_time: Duration, // How much time did it take to perform the iteration? Used for the warmup period.
+}
+impl<'a, M: Measurement> Bencher<'a, M> {
+    /// Times a `routine` by executing it many times and timing the total elapsed time.
+    ///
+    /// Prefer this timing loop when `routine` returns a value that doesn't have a destructor.
+    ///
+    /// # Timing model
+    ///
+    /// Note that the `Bencher` also measures the time required to destroy the output of `routine()`.
+    /// Therefore prefer this timing loop when the runtime of `mem::drop(O)` is negligible compared
+    /// to the runtime of the `routine`.
+    ///
+    /// ```text
+    /// elapsed = Instant::now + iters * (routine + mem::drop(O) + Range::next)
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    ///
+    /// // The function to benchmark
+    /// fn foo() {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("iter", move |b| {
+    ///         b.iter(|| foo())
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter<O, R>(&mut self, mut routine: R)
+    where
+        R: FnMut() -> O,
+    {
+        self.iterated = true;
+        let time_start = Instant::now();
+        let start = self.measurement.start();
+        for _ in 0..self.iters {
+            black_box(routine());
+        }
+        self.value = self.measurement.end(start);
+        self.elapsed_time = time_start.elapsed();
+    }
+
+    /// Times a `routine` by executing it many times and relying on `routine` to measure its own execution time.
+    ///
+    /// Prefer this timing loop in cases where `routine` has to do its own measurements to
+    /// get accurate timing information (for example in multi-threaded scenarios where you spawn
+    /// and coordinate with multiple threads).
+    ///
+    /// # Timing model
+    /// Custom; the timing model is whatever value (`M::Value`, e.g. a `Duration` for `WallTime`) `routine` returns.
+    ///
+    /// # Example
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    /// use criterion::*;
+    /// use criterion::black_box;
+    /// use std::time::Instant;
+    ///
+    /// fn foo() {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("iter", move |b| {
+    ///         b.iter_custom(|iters| {
+    ///             let start = Instant::now();
+    ///             for _i in 0..iters {
+    ///                 black_box(foo());
+    ///             }
+    ///             start.elapsed()
+    ///         })
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_custom<R>(&mut self, mut routine: R)
+    where
+        R: FnMut(u64) -> M::Value,
+    {
+        self.iterated = true;
+        let time_start = Instant::now();
+        self.value = routine(self.iters);
+        self.elapsed_time = time_start.elapsed();
+    }
+
+    #[doc(hidden)]
+    pub fn iter_with_setup<I, O, S, R>(&mut self, setup: S, routine: R)
+    where
+        S: FnMut() -> I,
+        R: FnMut(I) -> O,
+    {
+        self.iter_batched(setup, routine, BatchSize::PerIteration);
+    }
+
+    /// Times a `routine` by collecting its output on each iteration. This avoids timing the
+    /// destructor of the value returned by `routine`.
+    ///
+    /// WARNING: This requires `O(iters * mem::size_of::<O>())` of memory, and `iters` is not under the
+    /// control of the caller. If this causes out-of-memory errors, use `iter_batched` instead.
+    ///
+    /// # Timing model
+    ///
+    /// ``` text
+    /// elapsed = Instant::now + iters * (routine) + Iterator::collect::<Vec<_>>
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    ///
+    /// fn create_vector() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("with_drop", move |b| {
+    ///         // This will avoid timing the Vec::drop.
+    ///         b.iter_with_large_drop(|| create_vector())
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    pub fn iter_with_large_drop<O, R>(&mut self, mut routine: R)
+    where
+        R: FnMut() -> O,
+    {
+        self.iter_batched(|| (), |_| routine(), BatchSize::SmallInput);
+    }
+
+    /// Times a `routine` that requires some input by generating a batch of input, then timing the
+    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
+    /// details on choosing the batch size. Use this when the routine must consume its input.
+    ///
+    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
+    /// data on each iteration.
+    ///
+    /// # Timing model
+    ///
+    /// ```text
+    /// elapsed = (Instant::now * num_batches) + (iters * (routine + O::drop)) + Vec::extend
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    ///
+    /// fn create_scrambled_data() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// // The sorting algorithm to test
+    /// fn sort(data: &mut [u64]) {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     let data = create_scrambled_data();
+    ///
+    ///     c.bench_function("with_setup", move |b| {
+    ///         // This will avoid timing the clone call.
+    ///         b.iter_batched(|| data.clone(), |mut data| sort(&mut data), BatchSize::SmallInput)
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_batched<I, O, S, R>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
+    where
+        S: FnMut() -> I,
+        R: FnMut(I) -> O,
+    {
+        self.iterated = true;
+        let batch_size = size.iters_per_batch(self.iters);
+        assert!(batch_size != 0, "Batch size must not be zero.");
+        let time_start = Instant::now();
+        self.value = self.measurement.zero();
+
+        if batch_size == 1 {
+            for _ in 0..self.iters {
+                let input = black_box(setup());
+
+                let start = self.measurement.start();
+                let output = routine(input);
+                let end = self.measurement.end(start);
+                self.value = self.measurement.add(&self.value, &end);
+
+                drop(black_box(output));
+            }
+        } else {
+            let mut iteration_counter = 0;
+
+            while iteration_counter < self.iters {
+                let batch_size = ::std::cmp::min(batch_size, self.iters - iteration_counter);
+
+                let inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
+                let mut outputs = Vec::with_capacity(batch_size as usize);
+
+                let start = self.measurement.start();
+                outputs.extend(inputs.into_iter().map(&mut routine));
+                let end = self.measurement.end(start);
+                self.value = self.measurement.add(&self.value, &end);
+
+                black_box(outputs);
+
+                iteration_counter += batch_size;
+            }
+        }
+
+        self.elapsed_time = time_start.elapsed();
+    }
+
+    /// Times a `routine` that requires some input by generating a batch of input, then timing the
+    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
+    /// details on choosing the batch size. Use this when the routine should accept the input by
+    /// mutable reference.
+    ///
+    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
+    /// data on each iteration.
+    ///
+    /// # Timing model
+    ///
+    /// ```text
+    /// elapsed = (Instant::now * num_batches) + (iters * routine) + Vec::extend
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    ///
+    /// fn create_scrambled_data() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// // The sorting algorithm to test
+    /// fn sort(data: &mut [u64]) {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     let data = create_scrambled_data();
+    ///
+    ///     c.bench_function("with_setup", move |b| {
+    ///         // This will avoid timing the clone call.
+    ///         b.iter_batched_ref(|| data.clone(), |data| sort(data), BatchSize::SmallInput)
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_batched_ref<I, O, S, R>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
+    where
+        S: FnMut() -> I,
+        R: FnMut(&mut I) -> O,
+    {
+        self.iterated = true;
+        let batch_size = size.iters_per_batch(self.iters);
+        assert!(batch_size != 0, "Batch size must not be zero.");
+        let time_start = Instant::now();
+        self.value = self.measurement.zero();
+
+        if batch_size == 1 {
+            for _ in 0..self.iters {
+                let mut input = black_box(setup());
+
+                let start = self.measurement.start();
+                let output = routine(&mut input);
+                let end = self.measurement.end(start);
+                self.value = self.measurement.add(&self.value, &end);
+
+                drop(black_box(output));
+                drop(black_box(input));
+            }
+        } else {
+            let mut iteration_counter = 0;
+
+            while iteration_counter < self.iters {
+                let batch_size = ::std::cmp::min(batch_size, self.iters - iteration_counter);
+
+                let mut inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
+                let mut outputs = Vec::with_capacity(batch_size as usize);
+
+                let start = self.measurement.start();
+                outputs.extend(inputs.iter_mut().map(&mut routine));
+                let end = self.measurement.end(start);
+                self.value = self.measurement.add(&self.value, &end);
+
+                black_box(outputs);
+
+                iteration_counter += batch_size;
+            }
+        }
+        self.elapsed_time = time_start.elapsed();
+    }
+
+    // Benchmarks must actually call one of the iter methods. This causes benchmarks to fail loudly
+    // if they don't.
+    pub(crate) fn assert_iterated(&mut self) {
+        assert!(
+            self.iterated,
+            "Benchmark function must call Bencher::iter or related method."
+        );
+        self.iterated = false;
+    }
+
+    /// Convert this bencher into an AsyncBencher, which enables async/await support.
+    #[cfg(feature = "async")]
+    pub fn to_async<'b, A: AsyncExecutor>(&'b mut self, runner: A) -> AsyncBencher<'a, 'b, A, M> {
+        AsyncBencher { b: self, runner }
+    }
+}
+
+/// Async/await variant of the Bencher struct.
+#[cfg(feature = "async")]
+pub struct AsyncBencher<'a, 'b, A: AsyncExecutor, M: Measurement = WallTime> {
+    b: &'b mut Bencher<'a, M>,
+    runner: A,
+}
+#[cfg(feature = "async")]
+impl<'a, 'b, A: AsyncExecutor, M: Measurement> AsyncBencher<'a, 'b, A, M> {
+    /// Times a `routine` by executing it many times and timing the total elapsed time.
+    ///
+    /// Prefer this timing loop when `routine` returns a value that doesn't have a destructor.
+    ///
+    /// # Timing model
+    ///
+    /// Note that the `AsyncBencher` also measures the time required to destroy the output of `routine()`.
+    /// Therefore prefer this timing loop when the runtime of `mem::drop(O)` is negligible compared
+    /// to the runtime of the `routine`.
+    ///
+    /// ```text
+    /// elapsed = Instant::now + iters * (routine + mem::drop(O) + Range::next)
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    /// use criterion::async_executor::FuturesExecutor;
+    ///
+    /// // The function to benchmark
+    /// async fn foo() {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("iter", move |b| {
+    ///         b.to_async(FuturesExecutor).iter(|| async { foo().await } )
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter<O, R, F>(&mut self, mut routine: R)
+    where
+        R: FnMut() -> F,
+        F: Future<Output = O>,
+    {
+        let AsyncBencher { b, runner } = self;
+        runner.block_on(async {
+            b.iterated = true;
+            let time_start = Instant::now();
+            let start = b.measurement.start();
+            for _ in 0..b.iters {
+                black_box(routine().await);
+            }
+            b.value = b.measurement.end(start);
+            b.elapsed_time = time_start.elapsed();
+        });
+    }
+
+    /// Times a `routine` by executing it many times and relying on `routine` to measure its own execution time.
+    ///
+    /// Prefer this timing loop in cases where `routine` has to do its own measurements to
+    /// get accurate timing information (for example in multi-threaded scenarios where you spawn
+    /// and coordinate with multiple threads).
+    ///
+    /// # Timing model
+    /// Custom; the timing model is whatever value (`M::Value`, e.g. a `Duration` for `WallTime`) `routine` returns.
+    ///
+    /// # Example
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    /// use criterion::*;
+    /// use criterion::black_box;
+    /// use criterion::async_executor::FuturesExecutor;
+    /// use std::time::Instant;
+    ///
+    /// async fn foo() {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("iter", move |b| {
+    ///         b.to_async(FuturesExecutor).iter_custom(|iters| {
+    ///             async move {
+    ///                 let start = Instant::now();
+    ///                 for _i in 0..iters {
+    ///                     black_box(foo().await);
+    ///                 }
+    ///                 start.elapsed()
+    ///             }
+    ///         })
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_custom<R, F>(&mut self, mut routine: R)
+    where
+        R: FnMut(u64) -> F,
+        F: Future<Output = M::Value>,
+    {
+        let AsyncBencher { b, runner } = self;
+        runner.block_on(async {
+            b.iterated = true;
+            let time_start = Instant::now();
+            b.value = routine(b.iters).await;
+            b.elapsed_time = time_start.elapsed();
+        })
+    }
+
+    #[doc(hidden)]
+    pub fn iter_with_setup<I, O, S, R, F>(&mut self, setup: S, routine: R)
+    where
+        S: FnMut() -> I,
+        R: FnMut(I) -> F,
+        F: Future<Output = O>,
+    {
+        self.iter_batched(setup, routine, BatchSize::PerIteration);
+    }
+
+    /// Times a `routine` by collecting its output on each iteration. This avoids timing the
+    /// destructor of the value returned by `routine`.
+    ///
+    /// WARNING: This requires `O(iters * mem::size_of::<O>())` of memory, and `iters` is not under the
+    /// control of the caller. If this causes out-of-memory errors, use `iter_batched` instead.
+    ///
+    /// # Timing model
+    ///
+    /// ``` text
+    /// elapsed = Instant::now + iters * (routine) + Iterator::collect::<Vec<_>>
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    /// use criterion::async_executor::FuturesExecutor;
+    ///
+    /// async fn create_vector() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("with_drop", move |b| {
+    ///         // This will avoid timing the Vec::drop.
+    ///         b.to_async(FuturesExecutor).iter_with_large_drop(|| async { create_vector().await })
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    pub fn iter_with_large_drop<O, R, F>(&mut self, mut routine: R)
+    where
+        R: FnMut() -> F,
+        F: Future<Output = O>,
+    {
+        self.iter_batched(|| (), |_| routine(), BatchSize::SmallInput);
+    }
+
+    #[doc(hidden)]
+    pub fn iter_with_large_setup<I, O, S, R, F>(&mut self, setup: S, routine: R)
+    where
+        S: FnMut() -> I,
+        R: FnMut(I) -> F,
+        F: Future<Output = O>,
+    {
+        self.iter_batched(setup, routine, BatchSize::NumBatches(1));
+    }
+
+    /// Times a `routine` that requires some input by generating a batch of input, then timing the
+    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
+    /// details on choosing the batch size. Use this when the routine must consume its input.
+    ///
+    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
+    /// data on each iteration.
+    ///
+    /// # Timing model
+    ///
+    /// ```text
+    /// elapsed = (Instant::now * num_batches) + (iters * (routine + O::drop)) + Vec::extend
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    /// use criterion::async_executor::FuturesExecutor;
+    ///
+    /// fn create_scrambled_data() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// // The sorting algorithm to test
+    /// async fn sort(data: &mut [u64]) {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     let data = create_scrambled_data();
+    ///
+    ///     c.bench_function("with_setup", move |b| {
+    ///         // This will avoid timing the clone call.
+    ///         b.to_async(FuturesExecutor).iter_batched(|| data.clone(), |mut data| async move { sort(&mut data).await }, BatchSize::SmallInput)
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_batched<I, O, S, R, F>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
+    where
+        S: FnMut() -> I,
+        R: FnMut(I) -> F,
+        F: Future<Output = O>,
+    {
+        let AsyncBencher { b, runner } = self;
+        runner.block_on(async {
+            b.iterated = true;
+            let batch_size = size.iters_per_batch(b.iters);
+            assert!(batch_size != 0, "Batch size must not be zero.");
+            let time_start = Instant::now();
+            b.value = b.measurement.zero();
+
+            if batch_size == 1 {
+                for _ in 0..b.iters {
+                    let input = black_box(setup());
+
+                    let start = b.measurement.start();
+                    let output = routine(input).await;
+                    let end = b.measurement.end(start);
+                    b.value = b.measurement.add(&b.value, &end);
+
+                    drop(black_box(output));
+                }
+            } else {
+                let mut iteration_counter = 0;
+
+                while iteration_counter < b.iters {
+                    let batch_size = ::std::cmp::min(batch_size, b.iters - iteration_counter);
+
+                    let inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
+                    let mut outputs = Vec::with_capacity(batch_size as usize);
+
+                    let start = b.measurement.start();
+                    // Can't use .extend here like the sync version does
+                    for input in inputs {
+                        outputs.push(routine(input).await);
+                    }
+                    let end = b.measurement.end(start);
+                    b.value = b.measurement.add(&b.value, &end);
+
+                    black_box(outputs);
+
+                    iteration_counter += batch_size;
+                }
+            }
+
+            b.elapsed_time = time_start.elapsed();
+        })
+    }
+
+    /// Times a `routine` that requires some input by generating a batch of input, then timing the
+    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
+    /// details on choosing the batch size. Use this when the routine should accept the input by
+    /// mutable reference.
+    ///
+    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
+    /// data on each iteration.
+    ///
+    /// # Timing model
+    ///
+    /// ```text
+    /// elapsed = (Instant::now * num_batches) + (iters * routine) + Vec::extend
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    /// use criterion::async_executor::FuturesExecutor;
+    ///
+    /// fn create_scrambled_data() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// // The sorting algorithm to test
+    /// async fn sort(data: &mut [u64]) {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     let data = create_scrambled_data();
+    ///
+    ///     c.bench_function("with_setup", move |b| {
+    ///         // This will avoid timing the clone call.
+    ///         b.to_async(FuturesExecutor).iter_batched(|| data.clone(), |mut data| async move { sort(&mut data).await }, BatchSize::SmallInput)
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_batched_ref<I, O, S, R, F>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
+    where
+        S: FnMut() -> I,
+        R: FnMut(&mut I) -> F,
+        F: Future<Output = O>,
+    {
+        let AsyncBencher { b, runner } = self;
+        runner.block_on(async {
+            b.iterated = true;
+            let batch_size = size.iters_per_batch(b.iters);
+            assert!(batch_size != 0, "Batch size must not be zero.");
+            let time_start = Instant::now();
+            b.value = b.measurement.zero();
+
+            if batch_size == 1 {
+                for _ in 0..b.iters {
+                    let mut input = black_box(setup());
+
+                    let start = b.measurement.start();
+                    let output = routine(&mut input).await;
+                    let end = b.measurement.end(start);
+                    b.value = b.measurement.add(&b.value, &end);
+
+                    drop(black_box(output));
+                    drop(black_box(input));
+                }
+            } else {
+                let mut iteration_counter = 0;
+
+                while iteration_counter < b.iters {
+                    let batch_size = ::std::cmp::min(batch_size, b.iters - iteration_counter);
+
+                    let inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
+                    let mut outputs = Vec::with_capacity(batch_size as usize);
+
+                    let start = b.measurement.start();
+                    // Can't use .extend here like the sync version does
+                    for mut input in inputs {
+                        outputs.push(routine(&mut input).await);
+                    }
+                    let end = b.measurement.end(start);
+                    b.value = b.measurement.add(&b.value, &end);
+
+                    black_box(outputs);
+
+                    iteration_counter += batch_size;
+                }
+            }
+            b.elapsed_time = time_start.elapsed();
+        });
+    }
+}
diff --git a/src/connection.rs b/src/connection.rs
index 53706d608..cef6db015 100644
--- a/src/connection.rs
+++ b/src/connection.rs
@@ -1,385 +1,385 @@
-use crate::report::BenchmarkId as InternalBenchmarkId;
-use crate::Throughput;
-use std::cell::RefCell;
-use std::convert::TryFrom;
-use std::io::{Read, Write};
-use std::mem::size_of;
-use std::net::TcpStream;
-
-#[derive(Debug)]
-pub enum MessageError {
-    Deserialization(ciborium::de::Error<std::io::Error>),
-    Serialization(ciborium::ser::Error<std::io::Error>),
-    Io(std::io::Error),
-}
-impl From<ciborium::de::Error<std::io::Error>> for MessageError {
-    fn from(other: ciborium::de::Error<std::io::Error>) -> Self {
-        MessageError::Deserialization(other)
-    }
-}
-impl From<ciborium::ser::Error<std::io::Error>> for MessageError {
-    fn from(other: ciborium::ser::Error<std::io::Error>) -> Self {
-        MessageError::Serialization(other)
-    }
-}
-impl From<std::io::Error> for MessageError {
-    fn from(other: std::io::Error) -> Self {
-        MessageError::Io(other)
-    }
-}
-impl std::fmt::Display for MessageError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            MessageError::Deserialization(error) => write!(
-                f,
-                "Failed to deserialize message to Criterion.rs benchmark:\n{}",
-                error
-            ),
-            MessageError::Serialization(error) => write!(
-                f,
-                "Failed to serialize message to Criterion.rs benchmark:\n{}",
-                error
-            ),
-            MessageError::Io(error) => write!(
-                f,
-                "Failed to read or write message to Criterion.rs benchmark:\n{}",
-                error
-            ),
-        }
-    }
-}
-impl std::error::Error for MessageError {
-    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
-        match self {
-            MessageError::Deserialization(err) => Some(err),
-            MessageError::Serialization(err) => Some(err),
-            MessageError::Io(err) => Some(err),
-        }
-    }
-}
-
-// Use str::len as a const fn once we bump MSRV over 1.39.
-const RUNNER_MAGIC_NUMBER: &str = "cargo-criterion";
-const RUNNER_HELLO_SIZE: usize = 15 //RUNNER_MAGIC_NUMBER.len() // magic number
-    + (size_of::<u8>() * 3); // version number
-
-const BENCHMARK_MAGIC_NUMBER: &str = "Criterion";
-const BENCHMARK_HELLO_SIZE: usize = 9 //BENCHMARK_MAGIC_NUMBER.len() // magic number
-    + (size_of::<u8>() * 3) // version number
-    + size_of::<u16>() // protocol version
-    + size_of::<u16>(); // protocol format
-const PROTOCOL_VERSION: u16 = 1;
-const PROTOCOL_FORMAT: u16 = 1;
-
-#[derive(Debug)]
-struct InnerConnection {
-    socket: TcpStream,
-    receive_buffer: Vec<u8>,
-    send_buffer: Vec<u8>,
-    // runner_version: [u8; 3],
-}
-impl InnerConnection {
-    pub fn new(mut socket: TcpStream) -> Result<Self, std::io::Error> {
-        // read the runner-hello
-        let mut hello_buf = [0u8; RUNNER_HELLO_SIZE];
-        socket.read_exact(&mut hello_buf)?;
-        assert_eq!(
-            &hello_buf[0..RUNNER_MAGIC_NUMBER.len()],
-            RUNNER_MAGIC_NUMBER.as_bytes(),
-            "Not connected to cargo-criterion."
-        );
-
-        let i = RUNNER_MAGIC_NUMBER.len();
-        let runner_version = [hello_buf[i], hello_buf[i + 1], hello_buf[i + 2]];
-
-        info!("Runner version: {:?}", runner_version);
-
-        // now send the benchmark-hello
-        let mut hello_buf = [0u8; BENCHMARK_HELLO_SIZE];
-        hello_buf[0..BENCHMARK_MAGIC_NUMBER.len()]
-            .copy_from_slice(BENCHMARK_MAGIC_NUMBER.as_bytes());
-        let mut i = BENCHMARK_MAGIC_NUMBER.len();
-        hello_buf[i] = env!("CARGO_PKG_VERSION_MAJOR").parse().unwrap();
-        hello_buf[i + 1] = env!("CARGO_PKG_VERSION_MINOR").parse().unwrap();
-        hello_buf[i + 2] = env!("CARGO_PKG_VERSION_PATCH").parse().unwrap();
-        i += 3;
-        hello_buf[i..i + 2].clone_from_slice(&PROTOCOL_VERSION.to_be_bytes());
-        i += 2;
-        hello_buf[i..i + 2].clone_from_slice(&PROTOCOL_FORMAT.to_be_bytes());
-
-        socket.write_all(&hello_buf)?;
-
-        Ok(InnerConnection {
-            socket,
-            receive_buffer: vec![],
-            send_buffer: vec![],
-            // runner_version,
-        })
-    }
-
-    #[allow(dead_code)]
-    pub fn recv(&mut self) -> Result<IncomingMessage, MessageError> {
-        let mut length_buf = [0u8; 4];
-        self.socket.read_exact(&mut length_buf)?;
-        let length = u32::from_be_bytes(length_buf);
-        self.receive_buffer.resize(length as usize, 0u8);
-        self.socket.read_exact(&mut self.receive_buffer)?;
-        let value = ciborium::de::from_reader(&self.receive_buffer[..])?;
-        Ok(value)
-    }
-
-    pub fn send(&mut self, message: &OutgoingMessage) -> Result<(), MessageError> {
-        self.send_buffer.truncate(0);
-        ciborium::ser::into_writer(message, &mut self.send_buffer)?;
-        let size = u32::try_from(self.send_buffer.len()).unwrap();
-        let length_buf = size.to_be_bytes();
-        self.socket.write_all(&length_buf)?;
-        self.socket.write_all(&self.send_buffer)?;
-        Ok(())
-    }
-}
-
-/// This is really just a holder to allow us to send messages through a shared reference to the
-/// connection.
-#[derive(Debug)]
-pub struct Connection {
-    inner: RefCell<InnerConnection>,
-}
-impl Connection {
-    pub fn new(socket: TcpStream) -> Result<Self, std::io::Error> {
-        Ok(Connection {
-            inner: RefCell::new(InnerConnection::new(socket)?),
-        })
-    }
-
-    #[allow(dead_code)]
-    pub fn recv(&self) -> Result<IncomingMessage, MessageError> {
-        self.inner.borrow_mut().recv()
-    }
-
-    pub fn send(&self, message: &OutgoingMessage) -> Result<(), MessageError> {
-        self.inner.borrow_mut().send(message)
-    }
-
-    pub fn serve_value_formatter(
-        &self,
-        formatter: &dyn crate::measurement::ValueFormatter,
-    ) -> Result<(), MessageError> {
-        loop {
-            let response = match self.recv()? {
-                IncomingMessage::FormatValue { value } => OutgoingMessage::FormattedValue {
-                    value: formatter.format_value(value),
-                },
-                IncomingMessage::FormatThroughput { value, throughput } => {
-                    OutgoingMessage::FormattedValue {
-                        value: formatter.format_throughput(&throughput, value),
-                    }
-                }
-                IncomingMessage::ScaleValues {
-                    typical_value,
-                    mut values,
-                } => {
-                    let unit = formatter.scale_values(typical_value, &mut values);
-                    OutgoingMessage::ScaledValues {
-                        unit,
-                        scaled_values: values,
-                    }
-                }
-                IncomingMessage::ScaleThroughputs {
-                    typical_value,
-                    throughput,
-                    mut values,
-                } => {
-                    let unit = formatter.scale_throughputs(typical_value, &throughput, &mut values);
-                    OutgoingMessage::ScaledValues {
-                        unit,
-                        scaled_values: values,
-                    }
-                }
-                IncomingMessage::ScaleForMachines { mut values } => {
-                    let unit = formatter.scale_for_machines(&mut values);
-                    OutgoingMessage::ScaledValues {
-                        unit,
-                        scaled_values: values,
-                    }
-                }
-                IncomingMessage::Continue => break,
-                _ => panic!(),
-            };
-            self.send(&response)?;
-        }
-        Ok(())
-    }
-}
-
-/// Enum defining the messages we can receive
-#[derive(Debug, Deserialize)]
-pub enum IncomingMessage {
-    // Value formatter requests
-    FormatValue {
-        value: f64,
-    },
-    FormatThroughput {
-        value: f64,
-        throughput: Throughput,
-    },
-    ScaleValues {
-        typical_value: f64,
-        values: Vec<f64>,
-    },
-    ScaleThroughputs {
-        typical_value: f64,
-        values: Vec<f64>,
-        throughput: Throughput,
-    },
-    ScaleForMachines {
-        values: Vec<f64>,
-    },
-    Continue,
-
-    __Other,
-}
-
-/// Enum defining the messages we can send
-#[derive(Debug, Serialize)]
-pub enum OutgoingMessage<'a> {
-    BeginningBenchmarkGroup {
-        group: &'a str,
-    },
-    FinishedBenchmarkGroup {
-        group: &'a str,
-    },
-    BeginningBenchmark {
-        id: RawBenchmarkId,
-    },
-    SkippingBenchmark {
-        id: RawBenchmarkId,
-    },
-    Warmup {
-        id: RawBenchmarkId,
-        nanos: f64,
-    },
-    MeasurementStart {
-        id: RawBenchmarkId,
-        sample_count: u64,
-        estimate_ns: f64,
-        iter_count: u64,
-    },
-    MeasurementComplete {
-        id: RawBenchmarkId,
-        iters: &'a [f64],
-        times: &'a [f64],
-        plot_config: PlotConfiguration,
-        sampling_method: SamplingMethod,
-        benchmark_config: BenchmarkConfig,
-    },
-    // value formatter responses
-    FormattedValue {
-        value: String,
-    },
-    ScaledValues {
-        scaled_values: Vec<f64>,
-        unit: &'a str,
-    },
-}
-
-// Also define serializable variants of certain things, either to avoid leaking
-// serializability into the public interface or because the serialized form
-// is a bit different from the regular one.
-
-#[derive(Debug, Serialize)]
-pub struct RawBenchmarkId {
-    group_id: String,
-    function_id: Option<String>,
-    value_str: Option<String>,
-    throughput: Vec<Throughput>,
-}
-impl From<&InternalBenchmarkId> for RawBenchmarkId {
-    fn from(other: &InternalBenchmarkId) -> RawBenchmarkId {
-        RawBenchmarkId {
-            group_id: other.group_id.clone(),
-            function_id: other.function_id.clone(),
-            value_str: other.value_str.clone(),
-            throughput: other.throughput.iter().cloned().collect(),
-        }
-    }
-}
-
-#[derive(Debug, Serialize)]
-pub enum AxisScale {
-    Linear,
-    Logarithmic,
-}
-impl From<crate::AxisScale> for AxisScale {
-    fn from(other: crate::AxisScale) -> Self {
-        match other {
-            crate::AxisScale::Linear => AxisScale::Linear,
-            crate::AxisScale::Logarithmic => AxisScale::Logarithmic,
-        }
-    }
-}
-
-#[derive(Debug, Serialize)]
-pub struct PlotConfiguration {
-    summary_scale: AxisScale,
-}
-impl From<&crate::PlotConfiguration> for PlotConfiguration {
-    fn from(other: &crate::PlotConfiguration) -> Self {
-        PlotConfiguration {
-            summary_scale: other.summary_scale.into(),
-        }
-    }
-}
-
-#[derive(Debug, Serialize)]
-struct Duration {
-    secs: u64,
-    nanos: u32,
-}
-impl From<std::time::Duration> for Duration {
-    fn from(other: std::time::Duration) -> Self {
-        Duration {
-            secs: other.as_secs(),
-            nanos: other.subsec_nanos(),
-        }
-    }
-}
-
-#[derive(Debug, Serialize)]
-pub struct BenchmarkConfig {
-    confidence_level: f64,
-    measurement_time: Duration,
-    noise_threshold: f64,
-    nresamples: usize,
-    sample_size: usize,
-    significance_level: f64,
-    warm_up_time: Duration,
-}
-impl From<&crate::benchmark::BenchmarkConfig> for BenchmarkConfig {
-    fn from(other: &crate::benchmark::BenchmarkConfig) -> Self {
-        BenchmarkConfig {
-            confidence_level: other.confidence_level,
-            measurement_time: other.measurement_time.into(),
-            noise_threshold: other.noise_threshold,
-            nresamples: other.nresamples,
-            sample_size: other.sample_size,
-            significance_level: other.significance_level,
-            warm_up_time: other.warm_up_time.into(),
-        }
-    }
-}
-
-/// Currently not used; defined for forwards compatibility with cargo-criterion.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub enum SamplingMethod {
-    Linear,
-    Flat,
-}
-impl From<crate::ActualSamplingMode> for SamplingMethod {
-    fn from(other: crate::ActualSamplingMode) -> Self {
-        match other {
-            crate::ActualSamplingMode::Flat => SamplingMethod::Flat,
-            crate::ActualSamplingMode::Linear => SamplingMethod::Linear,
-        }
-    }
-}
+use crate::report::BenchmarkId as InternalBenchmarkId;
+use crate::Throughput;
+use std::cell::RefCell;
+use std::convert::TryFrom;
+use std::io::{Read, Write};
+use std::mem::size_of;
+use std::net::TcpStream;
+
+#[derive(Debug)]
+pub enum MessageError { // failure modes of `recv`/`send` on the connection
+    Deserialization(ciborium::de::Error<std::io::Error>), // decoding a received message failed
+    Serialization(ciborium::ser::Error<std::io::Error>), // encoding an outgoing message failed
+    Io(std::io::Error), // reading from or writing to the socket failed
+}
+impl From<ciborium::de::Error<std::io::Error>> for MessageError {
+    fn from(cause: ciborium::de::Error<std::io::Error>) -> Self {
+        Self::Deserialization(cause) // lets `?` lift decode errors into `MessageError`
+    }
+}
+impl From<ciborium::ser::Error<std::io::Error>> for MessageError {
+    fn from(cause: ciborium::ser::Error<std::io::Error>) -> Self {
+        Self::Serialization(cause) // lets `?` lift encode errors into `MessageError`
+    }
+}
+impl From<std::io::Error> for MessageError {
+    fn from(cause: std::io::Error) -> Self {
+        Self::Io(cause) // lets `?` lift socket errors into `MessageError`
+    }
+}
+impl std::fmt::Display for MessageError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self { // headline naming the failed step, a newline, then the wrapped error
+            Self::Deserialization(cause) => write!(
+                f,
+                "Failed to deserialize message to Criterion.rs benchmark:\n{}",
+                cause
+            ),
+            Self::Serialization(cause) => write!(
+                f,
+                "Failed to serialize message to Criterion.rs benchmark:\n{}",
+                cause
+            ),
+            Self::Io(cause) => write!(
+                f,
+                "Failed to read or write message to Criterion.rs benchmark:\n{}",
+                cause
+            ),
+        }
+    }
+}
+impl std::error::Error for MessageError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self { // every variant wraps exactly one underlying error
+            Self::Deserialization(cause) => Some(cause),
+            Self::Serialization(cause) => Some(cause),
+            Self::Io(cause) => Some(cause),
+        }
+    }
+}
+
+// The hello sizes are derived from the magic strings so the two can never drift apart.
+const RUNNER_MAGIC_NUMBER: &str = "cargo-criterion";
+const RUNNER_HELLO_SIZE: usize = RUNNER_MAGIC_NUMBER.len() // magic number
+    + (size_of::<u8>() * 3); // version number
+
+const BENCHMARK_MAGIC_NUMBER: &str = "Criterion";
+const BENCHMARK_HELLO_SIZE: usize = BENCHMARK_MAGIC_NUMBER.len() // magic number
+    + (size_of::<u8>() * 3) // version number
+    + size_of::<u16>() // protocol version
+    + size_of::<u16>(); // protocol format
+const PROTOCOL_VERSION: u16 = 1;
+const PROTOCOL_FORMAT: u16 = 1;
+
+#[derive(Debug)]
+struct InnerConnection { // the socket plus scratch buffers kept between messages
+    socket: TcpStream,
+    receive_buffer: Vec<u8>, // resized by `recv` to the length of each incoming payload
+    send_buffer: Vec<u8>, // filled by `send` with the serialized outgoing message
+    // runner_version: [u8; 3],
+}
+impl InnerConnection {
+    /// Performs the hello exchange on `socket`; panics if the peer's magic string is wrong.
+    pub fn new(mut socket: TcpStream) -> Result<Self, std::io::Error> {
+        // read the runner-hello
+        let mut hello_buf = [0u8; RUNNER_HELLO_SIZE];
+        socket.read_exact(&mut hello_buf)?;
+        assert_eq!(
+            &hello_buf[0..RUNNER_MAGIC_NUMBER.len()],
+            RUNNER_MAGIC_NUMBER.as_bytes(),
+            "Not connected to cargo-criterion."
+        );
+
+        let i = RUNNER_MAGIC_NUMBER.len();
+        let runner_version = [hello_buf[i], hello_buf[i + 1], hello_buf[i + 2]];
+        info!("Runner version: {:?}", runner_version);
+
+        // now send the benchmark-hello: magic, crate version, protocol version, protocol format
+        let mut hello_buf = [0u8; BENCHMARK_HELLO_SIZE];
+        hello_buf[0..BENCHMARK_MAGIC_NUMBER.len()]
+            .copy_from_slice(BENCHMARK_MAGIC_NUMBER.as_bytes());
+        let mut i = BENCHMARK_MAGIC_NUMBER.len();
+        hello_buf[i] = env!("CARGO_PKG_VERSION_MAJOR").parse().unwrap();
+        hello_buf[i + 1] = env!("CARGO_PKG_VERSION_MINOR").parse().unwrap();
+        hello_buf[i + 2] = env!("CARGO_PKG_VERSION_PATCH").parse().unwrap();
+        i += 3;
+        hello_buf[i..i + 2].copy_from_slice(&PROTOCOL_VERSION.to_be_bytes());
+        i += 2;
+        hello_buf[i..i + 2].copy_from_slice(&PROTOCOL_FORMAT.to_be_bytes());
+
+        socket.write_all(&hello_buf)?;
+        Ok(InnerConnection {
+            socket,
+            receive_buffer: vec![],
+            send_buffer: vec![],
+            // runner_version,
+        })
+    }
+
+    /// Reads one message: a 4-byte big-endian length, then that many payload bytes.
+    #[allow(dead_code)]
+    pub fn recv(&mut self) -> Result<IncomingMessage, MessageError> {
+        let mut length_buf = [0u8; 4];
+        self.socket.read_exact(&mut length_buf)?;
+        let length = u32::from_be_bytes(length_buf);
+        self.receive_buffer.resize(length as usize, 0u8);
+        self.socket.read_exact(&mut self.receive_buffer)?;
+        Ok(ciborium::de::from_reader(&self.receive_buffer[..])?)
+    }
+
+    /// Sends `message` as a 4-byte big-endian length plus payload; oversized payloads yield `Io`.
+    pub fn send(&mut self, message: &OutgoingMessage) -> Result<(), MessageError> {
+        self.send_buffer.clear();
+        ciborium::ser::into_writer(message, &mut self.send_buffer)?;
+        let size = u32::try_from(self.send_buffer.len())
+            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))?;
+        self.socket.write_all(&size.to_be_bytes())?;
+        self.socket.write_all(&self.send_buffer)?;
+        Ok(())
+    }
+}
+
+/// Thin wrapper that keeps the connection in a `RefCell` so messages can be sent and
+/// received through a shared (`&self`) reference.
+#[derive(Debug)]
+pub struct Connection {
+    inner: RefCell<InnerConnection>,
+}
+impl Connection {
+    /// Runs the hello exchange on `socket` via `InnerConnection::new` and wraps the result.
+    pub fn new(socket: TcpStream) -> Result<Self, std::io::Error> {
+        Ok(Connection {
+            inner: RefCell::new(InnerConnection::new(socket)?),
+        })
+    }
+
+    /// Receives the next message; panics if the inner connection is already borrowed.
+    #[allow(dead_code)]
+    pub fn recv(&self) -> Result<IncomingMessage, MessageError> {
+        self.inner.borrow_mut().recv()
+    }
+
+    /// Sends `message`; panics if the inner connection is already borrowed.
+    pub fn send(&self, message: &OutgoingMessage) -> Result<(), MessageError> {
+        self.inner.borrow_mut().send(message)
+    }
+
+    /// Answers formatting and scaling requests with `formatter` until `Continue` arrives.
+    ///
+    /// Panics on the `__Other` placeholder message; I/O and codec failures are returned.
+    pub fn serve_value_formatter(
+        &self,
+        formatter: &dyn crate::measurement::ValueFormatter,
+    ) -> Result<(), MessageError> {
+        loop {
+            let response = match self.recv()? {
+                IncomingMessage::FormatValue { value } => OutgoingMessage::FormattedValue {
+                    value: formatter.format_value(value),
+                },
+                IncomingMessage::FormatThroughput { value, throughput } => {
+                    OutgoingMessage::FormattedValue {
+                        value: formatter.format_throughput(&throughput, value),
+                    }
+                }
+                // In the scale arms `unit` is evaluated first, so `values` is scaled before moving.
+                IncomingMessage::ScaleValues {
+                    typical_value,
+                    mut values,
+                } => OutgoingMessage::ScaledValues {
+                    unit: formatter.scale_values(typical_value, &mut values),
+                    scaled_values: values,
+                },
+                IncomingMessage::ScaleThroughputs {
+                    typical_value,
+                    throughput,
+                    mut values,
+                } => OutgoingMessage::ScaledValues {
+                    unit: formatter.scale_throughputs(typical_value, &throughput, &mut values),
+                    scaled_values: values,
+                },
+                IncomingMessage::ScaleForMachines { mut values } => {
+                    OutgoingMessage::ScaledValues {
+                        unit: formatter.scale_for_machines(&mut values),
+                        scaled_values: values,
+                    }
+                }
+                IncomingMessage::Continue => break,
+                IncomingMessage::__Other => panic!("Unsupported message from cargo-criterion."),
+            };
+            self.send(&response)?;
+        }
+        Ok(())
+    }
+}
+
+/// Messages the benchmark can receive over the connection; `recv` deserializes into this type.
+#[derive(Debug, Deserialize)]
+pub enum IncomingMessage {
+    // Value formatter requests; each is answered by `Connection::serve_value_formatter`.
+    FormatValue {
+        value: f64,
+    },
+    FormatThroughput {
+        value: f64,
+        throughput: Throughput,
+    },
+    ScaleValues {
+        typical_value: f64,
+        values: Vec<f64>,
+    },
+    ScaleThroughputs {
+        typical_value: f64,
+        values: Vec<f64>,
+        throughput: Throughput,
+    },
+    ScaleForMachines {
+        values: Vec<f64>,
+    },
+    Continue, // ends the `serve_value_formatter` loop without a reply
+    // Placeholder variant; `serve_value_formatter` panics if it receives this.
+    __Other,
+}
+
+/// Messages the benchmark can send over the connection; `send` serializes values of this type.
+#[derive(Debug, Serialize)]
+pub enum OutgoingMessage<'a> {
+    BeginningBenchmarkGroup {
+        group: &'a str,
+    },
+    FinishedBenchmarkGroup {
+        group: &'a str,
+    },
+    BeginningBenchmark {
+        id: RawBenchmarkId,
+    },
+    SkippingBenchmark {
+        id: RawBenchmarkId,
+    },
+    Warmup {
+        id: RawBenchmarkId,
+        nanos: f64,
+    },
+    MeasurementStart {
+        id: RawBenchmarkId,
+        sample_count: u64,
+        estimate_ns: f64,
+        iter_count: u64,
+    },
+    MeasurementComplete {
+        id: RawBenchmarkId,
+        iters: &'a [f64],
+        times: &'a [f64],
+        plot_config: PlotConfiguration,
+        sampling_method: SamplingMethod,
+        benchmark_config: BenchmarkConfig,
+    },
+    // Value formatter responses, produced by `Connection::serve_value_formatter`.
+    FormattedValue {
+        value: String,
+    },
+    ScaledValues {
+        scaled_values: Vec<f64>,
+        unit: &'a str,
+    },
+}
+
+// Also define serializable variants of certain things, either to avoid leaking
+// serializability into the public interface or because the serialized form
+// is a bit different from the regular one.
+
+#[derive(Debug, Serialize)]
+pub struct RawBenchmarkId {
+    group_id: String,
+    function_id: Option<String>,
+    value_str: Option<String>,
+    throughput: Vec<Throughput>,
+}
+impl From<&InternalBenchmarkId> for RawBenchmarkId {
+    fn from(id: &InternalBenchmarkId) -> Self {
+        Self {
+            group_id: id.group_id.clone(),
+            function_id: id.function_id.clone(),
+            value_str: id.value_str.clone(),
+            throughput: id.throughput.iter().cloned().collect(), // gathered into a `Vec`
+        }
+    }
+}
+
+#[derive(Debug, Serialize)]
+pub enum AxisScale {
+    Linear,
+    Logarithmic,
+}
+impl From<crate::AxisScale> for AxisScale {
+    fn from(scale: crate::AxisScale) -> Self {
+        match scale { // one-to-one mapping onto the serializable mirror
+            crate::AxisScale::Logarithmic => Self::Logarithmic,
+            crate::AxisScale::Linear => Self::Linear,
+        }
+    }
+}
+
+#[derive(Debug, Serialize)]
+pub struct PlotConfiguration {
+    summary_scale: AxisScale,
+}
+impl From<&crate::PlotConfiguration> for PlotConfiguration {
+    fn from(config: &crate::PlotConfiguration) -> Self {
+        // Only the summary scale is carried over, converted to the serializable enum.
+        let summary_scale = AxisScale::from(config.summary_scale);
+        Self { summary_scale }
+    }
+}
+
+#[derive(Debug, Serialize)]
+struct Duration {
+    secs: u64,
+    nanos: u32,
+}
+impl From<std::time::Duration> for Duration {
+    fn from(elapsed: std::time::Duration) -> Self {
+        // Whole seconds plus the sub-second remainder in nanoseconds.
+        let secs = elapsed.as_secs();
+        let nanos = elapsed.subsec_nanos();
+        Self { secs, nanos }
+    }
+}
+
+#[derive(Debug, Serialize)]
+pub struct BenchmarkConfig {
+    confidence_level: f64,
+    measurement_time: Duration,
+    noise_threshold: f64,
+    nresamples: usize,
+    sample_size: usize,
+    significance_level: f64,
+    warm_up_time: Duration,
+}
+impl From<&crate::benchmark::BenchmarkConfig> for BenchmarkConfig {
+    fn from(config: &crate::benchmark::BenchmarkConfig) -> Self {
+        Self { // field-for-field copy; the two time spans become the serializable `Duration`
+            confidence_level: config.confidence_level,
+            measurement_time: Duration::from(config.measurement_time),
+            noise_threshold: config.noise_threshold,
+            nresamples: config.nresamples,
+            sample_size: config.sample_size,
+            significance_level: config.significance_level,
+            warm_up_time: Duration::from(config.warm_up_time),
+        }
+    }
+}
+
+/// Currently not used; defined for forwards compatibility with cargo-criterion.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub enum SamplingMethod {
+    Linear,
+    Flat,
+}
+impl From<crate::ActualSamplingMode> for SamplingMethod {
+    fn from(mode: crate::ActualSamplingMode) -> Self {
+        match mode { // one-to-one mapping onto the serializable mirror
+            crate::ActualSamplingMode::Linear => Self::Linear,
+            crate::ActualSamplingMode::Flat => Self::Flat,
+        }
+    }
+}