From 7c575a0b40b4cda2977d10ccc2c007ca2f77f3aa Mon Sep 17 00:00:00 2001
From: Kevin Reid <kpreid@switchb.org>
Date: Fri, 15 Sep 2023 21:16:49 -0700
Subject: [PATCH 01/14] Add details to `RequestDeviceError`. (#4145)

---
 CHANGELOG.md               |  2 +-
 tests/src/lib.rs           |  2 +-
 tests/tests/device.rs      | 51 ++++++++++++++++++++
 wgpu/src/backend/direct.rs |  3 +-
 wgpu/src/backend/web.rs    |  4 +-
 wgpu/src/lib.rs            | 95 ++++++++++++++++++++++++++++++++++++--
 6 files changed, 147 insertions(+), 10 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index db91b89718..ad4c81d076 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -75,7 +75,7 @@ By @Valaphee in [#3402](https://github.com/gfx-rs/wgpu/pull/3402)
 - Omit texture store bound checks since they are no-ops if out of bounds on all APIs. By @teoxoy in [#3975](https://github.com/gfx-rs/wgpu/pull/3975)
 - Validate `DownlevelFlags::READ_ONLY_DEPTH_STENCIL`. By @teoxoy in [#4031](https://github.com/gfx-rs/wgpu/pull/4031)
 - Add validation in accordance with WebGPU `setViewport` valid usage for `x`, `y` and `this.[[attachment_size]]`. By @James2022-rgb in [#4058](https://github.com/gfx-rs/wgpu/pull/4058)
-- `wgpu::CreateSurfaceError` now gives details of the failure, but no longer implements `PartialEq`. By @kpreid in [#4066](https://github.com/gfx-rs/wgpu/pull/4066)
+- `wgpu::CreateSurfaceError` and `wgpu::RequestDeviceError` now give details of the failure, but no longer implement `PartialEq` and cannot be constructed. By @kpreid in [#4066](https://github.com/gfx-rs/wgpu/pull/4066) and [#4145](https://github.com/gfx-rs/wgpu/pull/4145)
 - Make `WGPU_POWER_PREF=none` a valid value. By @fornwall in [4076](https://github.com/gfx-rs/wgpu/pull/4076)
 
 #### Vulkan
diff --git a/tests/src/lib.rs b/tests/src/lib.rs
index 236b353386..c506126708 100644
--- a/tests/src/lib.rs
+++ b/tests/src/lib.rs
@@ -449,7 +449,7 @@ pub fn initialize_test(parameters: TestParameters, test_function: impl FnOnce(Te
     }
 }
 
-fn initialize_adapter() -> (Adapter, Option<SurfaceGuard>) {
+pub fn initialize_adapter() -> (Adapter, Option<SurfaceGuard>) {
     let instance = initialize_instance();
     let surface_guard: Option<SurfaceGuard>;
     let compatible_surface;
diff --git a/tests/tests/device.rs b/tests/tests/device.rs
index f43791f86e..7964f2afdb 100644
--- a/tests/tests/device.rs
+++ b/tests/tests/device.rs
@@ -40,3 +40,54 @@ fn device_mismatch() {
         },
     );
 }
+
+#[cfg(not(all(target_arch = "wasm32", not(target_os = "emscripten"))))]
+#[test]
+fn request_device_error_on_native() {
+    pollster::block_on(request_device_error_message());
+}
+
+/// Check that `RequestDeviceError`s produced have some diagnostic information.
+///
+/// Note: this is a wasm *and* native test. On wasm it is run directly; on native, indirectly
+#[wasm_bindgen_test::wasm_bindgen_test]
+async fn request_device_error_message() {
+    // Not using initialize_test() because that doesn't let us catch the error
+    // nor .await anything
+    let (adapter, _surface_guard) = wgpu_test::initialize_adapter();
+
+    let device_error = adapter
+        .request_device(
+            &wgpu::DeviceDescriptor {
+                // Force a failure by requesting absurd limits.
+                features: wgpu::Features::all(),
+                limits: wgpu::Limits {
+                    max_texture_dimension_1d: u32::MAX,
+                    max_texture_dimension_2d: u32::MAX,
+                    max_texture_dimension_3d: u32::MAX,
+                    max_bind_groups: u32::MAX,
+                    max_push_constant_size: u32::MAX,
+                    ..Default::default()
+                },
+                ..Default::default()
+            },
+            None,
+        )
+        .await
+        .unwrap_err();
+
+    let device_error = device_error.to_string();
+    cfg_if::cfg_if! {
+        if #[cfg(all(target_arch = "wasm32", not(feature = "webgl")))] {
+            // On WebGPU, so the error we get will be from the browser WebGPU API.
+            // Per the WebGPU specification this should be a `TypeError` when features are not
+            // available, <https://gpuweb.github.io/gpuweb/#dom-gpuadapter-requestdevice>,
+            // and the stringification it goes through for Rust should put that in the message.
+            let expected = "TypeError";
+        } else {
+            // This message appears whenever wgpu-core is used as the implementation.
+            let expected = "Unsupported features were requested: Features(";
+        }
+    }
+    assert!(device_error.contains(expected), "{device_error}");
+}
diff --git a/wgpu/src/backend/direct.rs b/wgpu/src/backend/direct.rs
index 8eec9adad5..2e15e295e8 100644
--- a/wgpu/src/backend/direct.rs
+++ b/wgpu/src/backend/direct.rs
@@ -622,8 +622,7 @@ impl crate::Context for Context {
             ()
         ));
         if let Some(err) = error {
-            log::error!("Error in Adapter::request_device: {}", err);
-            return ready(Err(crate::RequestDeviceError));
+            return ready(Err(err.into()));
         }
         let error_sink = Arc::new(Mutex::new(ErrorSinkRaw::new()));
         let device = Device {
diff --git a/wgpu/src/backend/web.rs b/wgpu/src/backend/web.rs
index d64bd8bcb1..2f83d50c55 100644
--- a/wgpu/src/backend/web.rs
+++ b/wgpu/src/backend/web.rs
@@ -812,7 +812,9 @@ fn future_request_device(
 
             (device_id, device_data, queue_id, queue_data)
         })
-        .map_err(|_| crate::RequestDeviceError)
+        .map_err(|error_value| crate::RequestDeviceError {
+            inner: crate::RequestDeviceErrorKind::Web(error_value),
+        })
 }
 
 fn future_pop_error_scope(result: JsFutureResult) -> Option<crate::Error> {
diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs
index 94345f1adb..19dc20120d 100644
--- a/wgpu/src/lib.rs
+++ b/wgpu/src/lib.rs
@@ -2738,18 +2738,103 @@ impl Drop for Device {
     }
 }
 
-/// Requesting a device failed.
-#[derive(Clone, PartialEq, Eq, Debug)]
-pub struct RequestDeviceError;
+/// Requesting a device from an [`Adapter`] failed.
+#[derive(Clone, Debug)]
+pub struct RequestDeviceError {
+    inner: RequestDeviceErrorKind,
+}
+#[derive(Clone, Debug)]
+enum RequestDeviceErrorKind {
+    /// Error from [`wgpu_core`].
+    // must match dependency cfg
+    #[cfg(any(
+        not(target_arch = "wasm32"),
+        feature = "webgl",
+        target_os = "emscripten"
+    ))]
+    Core(core::instance::RequestDeviceError),
+
+    /// Error from web API that was called by `wgpu` to request a device.
+    ///
+    /// (This is currently never used by the webgl backend, but it could be.)
+    #[cfg(all(
+        target_arch = "wasm32",
+        not(any(target_os = "emscripten", feature = "webgl"))
+    ))]
+    Web(wasm_bindgen::JsValue),
+}
+
+#[cfg(all(
+    feature = "fragile-send-sync-non-atomic-wasm",
+    not(target_feature = "atomics")
+))]
+unsafe impl Send for RequestDeviceErrorKind {}
+#[cfg(all(
+    feature = "fragile-send-sync-non-atomic-wasm",
+    not(target_feature = "atomics")
+))]
+unsafe impl Sync for RequestDeviceErrorKind {}
+
+#[cfg(any(
+    not(target_arch = "wasm32"),
+    all(
+        feature = "fragile-send-sync-non-atomic-wasm",
+        not(target_feature = "atomics")
+    )
+))]
 static_assertions::assert_impl_all!(RequestDeviceError: Send, Sync);
 
 impl fmt::Display for RequestDeviceError {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "Requesting a device failed")
+        match &self.inner {
+            #[cfg(any(
+                not(target_arch = "wasm32"),
+                feature = "webgl",
+                target_os = "emscripten"
+            ))]
+            RequestDeviceErrorKind::Core(error) => error.fmt(f),
+            #[cfg(all(
+                target_arch = "wasm32",
+                not(any(target_os = "emscripten", feature = "webgl"))
+            ))]
+            RequestDeviceErrorKind::Web(error_js_value) => {
+                // wasm-bindgen provides a reasonable error stringification via `Debug` impl
+                write!(f, "{error_js_value:?}")
+            }
+        }
+    }
+}
+
+impl error::Error for RequestDeviceError {
+    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
+        match &self.inner {
+            #[cfg(any(
+                not(target_arch = "wasm32"),
+                feature = "webgl",
+                target_os = "emscripten"
+            ))]
+            RequestDeviceErrorKind::Core(error) => error.source(),
+            #[cfg(all(
+                target_arch = "wasm32",
+                not(any(target_os = "emscripten", feature = "webgl"))
+            ))]
+            RequestDeviceErrorKind::Web(_) => None,
+        }
     }
 }
 
-impl error::Error for RequestDeviceError {}
+#[cfg(any(
+    not(target_arch = "wasm32"),
+    feature = "webgl",
+    target_os = "emscripten"
+))]
+impl From<core::instance::RequestDeviceError> for RequestDeviceError {
+    fn from(error: core::instance::RequestDeviceError) -> Self {
+        Self {
+            inner: RequestDeviceErrorKind::Core(error),
+        }
+    }
+}
 
 /// [`Instance::create_surface()`] or a related function failed.
 #[derive(Clone, Debug)]

From 0ffdae31a1f75e1041ed4472eb0552c487831efe Mon Sep 17 00:00:00 2001
From: Andreas Reich <r_andreas2@web.de>
Date: Sat, 16 Sep 2023 22:01:46 +0200
Subject: [PATCH 02/14] Metal encoder & pass timestamp support (#4008)

Implements timer queries via write_timestamp on Metal for encoders (whenever timer queries are available) and passes (for Intel/AMD GPUs, where we should advertise TIMESTAMP_QUERY_INSIDE_PASSES now).

Due to some bugs in Metal this was a lot harder than expected. I believe the solution is close to optimal with the current restrictions in place. For details see code comments.
---
 .deny.toml                             |   1 +
 CHANGELOG.md                           |   4 +
 Cargo.lock                             |   3 +-
 Cargo.toml                             |   2 +
 examples/timestamp-queries/src/main.rs |  18 +--
 wgpu-hal/src/metal/adapter.rs          |  46 ++++--
 wgpu-hal/src/metal/command.rs          | 213 +++++++++++++++++++++----
 wgpu-hal/src/metal/mod.rs              |  27 +++-
 wgpu-types/src/lib.rs                  |   7 +-
 9 files changed, 260 insertions(+), 61 deletions(-)

diff --git a/.deny.toml b/.deny.toml
index 5c214bbc28..f7c233c5d4 100644
--- a/.deny.toml
+++ b/.deny.toml
@@ -27,6 +27,7 @@ allow = [
 [sources]
 allow-git = [
 	"https://github.com/grovesNL/glow",
+	"https://github.com/gfx-rs/metal-rs",
 ]
 unknown-registry = "deny"
 unknown-git = "deny"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad4c81d076..039bce54ab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -88,6 +88,10 @@ By @Valaphee in [#3402](https://github.com/gfx-rs/wgpu/pull/3402)
 ### Documentation
 - Use WGSL for VertexFormat example types. By @ScanMountGoat in [#4305](https://github.com/gfx-rs/wgpu/pull/4035)
 
+#### Metal
+
+- Support for timestamp queries on encoders and passes. By @wumpf in [#4008](https://github.com/gfx-rs/wgpu/pull/4008)
+
 ### Bug Fixes
 
 #### General
diff --git a/Cargo.lock b/Cargo.lock
index ba1a403628..07ed8c2c66 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1551,8 +1551,7 @@ dependencies = [
 [[package]]
 name = "metal"
 version = "0.26.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "623b5e6cefd76e58f774bd3cc0c6f5c7615c58c03a97815245a25c3c9bdee318"
+source = "git+https://github.com/gfx-rs/metal-rs/?rev=d24f1a4#d24f1a4ae92470bf87a0c65ecfe78c9299835505"
 dependencies = [
  "bitflags 2.4.0",
  "block",
diff --git a/Cargo.toml b/Cargo.toml
index 55c6048b86..22f79b73b1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -158,6 +158,8 @@ termcolor = "1.2.0"
 #glow = { path = "../glow" }
 #d3d12 = { path = "../d3d12-rs" }
 #metal = { path = "../metal-rs" }
+#metal = { path = "../metal-rs" }
+metal = { git = "https://github.com/gfx-rs/metal-rs/", rev = "d24f1a4" } # More timer support via https://github.com/gfx-rs/metal-rs/pull/280
 #web-sys = { path = "../wasm-bindgen/crates/web-sys" }
 #js-sys = { path = "../wasm-bindgen/crates/js-sys" }
 #wasm-bindgen = { path = "../wasm-bindgen" }
diff --git a/examples/timestamp-queries/src/main.rs b/examples/timestamp-queries/src/main.rs
index 3479122c79..f8c524f03c 100644
--- a/examples/timestamp-queries/src/main.rs
+++ b/examples/timestamp-queries/src/main.rs
@@ -47,6 +47,7 @@ impl QueryResults {
     // * compute end
     const NUM_QUERIES: u64 = 8;
 
+    #[allow(clippy::redundant_closure)] // False positive
     fn from_raw_results(timestamps: Vec<u64>, timestamps_inside_passes: bool) -> Self {
         assert_eq!(timestamps.len(), Self::NUM_QUERIES as usize);
 
@@ -60,9 +61,9 @@ impl QueryResults {
         let mut encoder_timestamps = [0, 0];
         encoder_timestamps[0] = get_next_slot();
         let render_start_end_timestamps = [get_next_slot(), get_next_slot()];
-        let render_inside_timestamp = timestamps_inside_passes.then_some(get_next_slot());
+        let render_inside_timestamp = timestamps_inside_passes.then(|| get_next_slot());
         let compute_start_end_timestamps = [get_next_slot(), get_next_slot()];
-        let compute_inside_timestamp = timestamps_inside_passes.then_some(get_next_slot());
+        let compute_inside_timestamp = timestamps_inside_passes.then(|| get_next_slot());
         encoder_timestamps[1] = get_next_slot();
 
         QueryResults {
@@ -79,8 +80,8 @@ impl QueryResults {
         let elapsed_us = |start, end: u64| end.wrapping_sub(start) as f64 * period as f64 / 1000.0;
 
         println!(
-            "Elapsed time render + compute: {:.2} μs",
-            elapsed_us(self.encoder_timestamps[0], self.encoder_timestamps[1])
+            "Elapsed time before render until after compute: {:.2} μs",
+            elapsed_us(self.encoder_timestamps[0], self.encoder_timestamps[1]),
         );
         println!(
             "Elapsed time render pass: {:.2} μs",
@@ -464,13 +465,10 @@ mod tests {
             render_start_end_timestamps[1].wrapping_sub(render_start_end_timestamps[0]);
         let compute_delta =
             compute_start_end_timestamps[1].wrapping_sub(compute_start_end_timestamps[0]);
+        let encoder_delta = encoder_timestamps[1].wrapping_sub(encoder_timestamps[0]);
 
-        // TODO: Metal encoder timestamps aren't implemented yet.
-        if ctx.adapter.get_info().backend != wgpu::Backend::Metal {
-            let encoder_delta = encoder_timestamps[1].wrapping_sub(encoder_timestamps[0]);
-            assert!(encoder_delta > 0);
-            assert!(encoder_delta >= render_delta + compute_delta);
-        }
+        assert!(encoder_delta > 0);
+        assert!(encoder_delta >= render_delta + compute_delta);
 
         if let Some(render_inside_timestamp) = render_inside_timestamp {
             assert!(render_inside_timestamp >= render_start_end_timestamps[0]);
diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs
index bc90954b35..126741d257 100644
--- a/wgpu-hal/src/metal/adapter.rs
+++ b/wgpu-hal/src/metal/adapter.rs
@@ -5,6 +5,8 @@ use wgt::{AstcBlock, AstcChannel};
 
 use std::{sync::Arc, thread};
 
+use super::TimestampQuerySupport;
+
 const MAX_COMMAND_BUFFERS: u64 = 2048;
 
 unsafe impl Send for super::Adapter {}
@@ -536,6 +538,26 @@ impl super::PrivateCapabilities {
             MTLReadWriteTextureTier::TierNone
         };
 
+        let mut timestamp_query_support = TimestampQuerySupport::empty();
+        if version.at_least((11, 0), (14, 0), os_is_mac)
+            && device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary)
+        {
+            // If we don't support at stage boundary, don't support anything else.
+            timestamp_query_support.insert(TimestampQuerySupport::STAGE_BOUNDARIES);
+
+            if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDrawBoundary) {
+                timestamp_query_support.insert(TimestampQuerySupport::ON_RENDER_ENCODER);
+            }
+            if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary)
+            {
+                timestamp_query_support.insert(TimestampQuerySupport::ON_COMPUTE_ENCODER);
+            }
+            if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtBlitBoundary) {
+                timestamp_query_support.insert(TimestampQuerySupport::ON_BLIT_ENCODER);
+            }
+            // `TimestampQuerySupport::INSIDE_WGPU_PASSES` emerges from the other flags.
+        }
+
         Self {
             family_check,
             msl_version: if os_is_xr || version.at_least((12, 0), (15, 0), os_is_mac) {
@@ -773,13 +795,7 @@ impl super::PrivateCapabilities {
             } else {
                 None
             },
-            support_timestamp_query: version.at_least((11, 0), (14, 0), os_is_mac)
-                && device
-                    .supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary),
-            support_timestamp_query_in_passes: version.at_least((11, 0), (14, 0), os_is_mac)
-                && device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDrawBoundary)
-                && device
-                    .supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary),
+            timestamp_query_support,
         }
     }
 
@@ -807,12 +823,16 @@ impl super::PrivateCapabilities {
             | F::DEPTH32FLOAT_STENCIL8
             | F::MULTI_DRAW_INDIRECT;
 
-        features.set(F::TIMESTAMP_QUERY, self.support_timestamp_query);
-        // TODO: Not yet implemented.
-        // features.set(
-        //     F::TIMESTAMP_QUERY_INSIDE_PASSES,
-        //     self.support_timestamp_query_in_passes,
-        // );
+        features.set(
+            F::TIMESTAMP_QUERY,
+            self.timestamp_query_support
+                .contains(TimestampQuerySupport::STAGE_BOUNDARIES),
+        );
+        features.set(
+            F::TIMESTAMP_QUERY_INSIDE_PASSES,
+            self.timestamp_query_support
+                .contains(TimestampQuerySupport::INSIDE_WGPU_PASSES),
+        );
         features.set(F::TEXTURE_COMPRESSION_ASTC, self.format_astc);
         features.set(F::TEXTURE_COMPRESSION_ASTC_HDR, self.format_astc_hdr);
         features.set(F::TEXTURE_COMPRESSION_BC, self.format_bc);
diff --git a/wgpu-hal/src/metal/command.rs b/wgpu-hal/src/metal/command.rs
index cc737fd228..c4b37f9932 100644
--- a/wgpu-hal/src/metal/command.rs
+++ b/wgpu-hal/src/metal/command.rs
@@ -1,4 +1,4 @@
-use super::{conv, AsNative};
+use super::{conv, AsNative, TimestampQuerySupport};
 use crate::CommandEncoder as _;
 use std::{borrow::Cow, mem, ops::Range};
 
@@ -18,6 +18,7 @@ impl Default for super::CommandState {
             storage_buffer_length_map: Default::default(),
             work_group_memory_sizes: Vec::new(),
             push_constants: Vec::new(),
+            pending_timer_queries: Vec::new(),
         }
     }
 }
@@ -26,10 +27,85 @@ impl super::CommandEncoder {
     fn enter_blit(&mut self) -> &metal::BlitCommandEncoderRef {
         if self.state.blit.is_none() {
             debug_assert!(self.state.render.is_none() && self.state.compute.is_none());
+            let cmd_buf = self.raw_cmd_buf.as_ref().unwrap();
+
+            // Take care of pending timer queries.
+            // If we can't use `sample_counters_in_buffer` we have to create a dummy blit encoder!
+            //
+            // There is a known bug in Metal where blit encoders won't write timestamps if they don't have a blit operation.
+            // See https://github.com/gpuweb/gpuweb/issues/2046#issuecomment-1205793680 & https://source.chromium.org/chromium/chromium/src/+/006c4eb70c96229834bbaf271290f40418144cd3:third_party/dawn/src/dawn/native/metal/BackendMTL.mm;l=350
+            //
+            // To make things worse:
+            // * what counts as a blit operation is a bit unclear, experimenting seemed to indicate that resolve_counters doesn't count.
+            // * in some cases (when?) using `set_start_of_encoder_sample_index` doesn't work, so we have to use `set_end_of_encoder_sample_index` instead
+            //
+            // All this means that pretty much the only *reliable* thing as of writing is to:
+            // * create a dummy blit encoder using set_end_of_encoder_sample_index
+            // * do a dummy write that is known to be not optimized out.
+            // * close the encoder since we used set_end_of_encoder_sample_index and don't want to get any extra stuff in there.
+            // * create another encoder for whatever we actually had in mind.
+            let supports_sample_counters_in_buffer = self
+                .shared
+                .private_caps
+                .timestamp_query_support
+                .contains(TimestampQuerySupport::ON_BLIT_ENCODER);
+
+            if !self.state.pending_timer_queries.is_empty() && !supports_sample_counters_in_buffer {
+                objc::rc::autoreleasepool(|| {
+                    let descriptor = metal::BlitPassDescriptor::new();
+                    let mut last_query = None;
+                    for (i, (set, index)) in self.state.pending_timer_queries.drain(..).enumerate()
+                    {
+                        let sba_descriptor = descriptor
+                            .sample_buffer_attachments()
+                            .object_at(i as _)
+                            .unwrap();
+                        sba_descriptor
+                            .set_sample_buffer(set.counter_sample_buffer.as_ref().unwrap());
+
+                        // Here be dragons:
+                        // As mentioned above, for some reasons using the start of the encoder won't yield any results sometimes!
+                        sba_descriptor
+                            .set_start_of_encoder_sample_index(metal::COUNTER_DONT_SAMPLE);
+                        sba_descriptor.set_end_of_encoder_sample_index(index as _);
+
+                        last_query = Some((set, index));
+                    }
+                    let encoder = cmd_buf.blit_command_encoder_with_descriptor(descriptor);
+
+                    // As explained above, we need to do some write:
+                    // Conveniently, we have a buffer with every query set, that we can use for this for a dummy write,
+                    // since we know that it is going to be overwritten again on timer resolve and HAL doesn't define its state before that.
+                    let raw_range = metal::NSRange {
+                        location: last_query.as_ref().unwrap().1 as u64 * crate::QUERY_SIZE,
+                        length: 1,
+                    };
+                    encoder.fill_buffer(
+                        &last_query.as_ref().unwrap().0.raw_buffer,
+                        raw_range,
+                        255, // Don't write 0, so it's easier to identify if something went wrong.
+                    );
+
+                    encoder.end_encoding();
+                });
+            }
+
             objc::rc::autoreleasepool(|| {
-                let cmd_buf = self.raw_cmd_buf.as_ref().unwrap();
                 self.state.blit = Some(cmd_buf.new_blit_command_encoder().to_owned());
             });
+
+            let encoder = self.state.blit.as_ref().unwrap();
+
+            // UNTESTED:
+            // If the above described issue with empty blit encoder applies to `sample_counters_in_buffer` as well, we should use the same workaround instead!
+            for (set, index) in self.state.pending_timer_queries.drain(..) {
+                debug_assert!(supports_sample_counters_in_buffer);
+                encoder.sample_counters_in_buffer(
+                    set.counter_sample_buffer.as_ref().unwrap(),
+                    index as _,
+                    true,
+                )
+            }
         }
         self.state.blit.as_ref().unwrap()
     }
@@ -40,7 +116,7 @@ impl super::CommandEncoder {
         }
     }
 
-    fn enter_any(&mut self) -> Option<&metal::CommandEncoderRef> {
+    fn active_encoder(&mut self) -> Option<&metal::CommandEncoderRef> {
         if let Some(ref encoder) = self.state.render {
             Some(encoder)
         } else if let Some(ref encoder) = self.state.compute {
@@ -127,9 +203,17 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
     }
 
     unsafe fn end_encoding(&mut self) -> Result<super::CommandBuffer, crate::DeviceError> {
+        // Handle pending timer query if any.
+        if !self.state.pending_timer_queries.is_empty() {
+            self.leave_blit();
+            self.enter_blit();
+        }
+
         self.leave_blit();
         debug_assert!(self.state.render.is_none());
         debug_assert!(self.state.compute.is_none());
+        debug_assert!(self.state.pending_timer_queries.is_empty());
+
         Ok(super::CommandBuffer {
             raw: self.raw_cmd_buf.take().unwrap(),
         })
@@ -322,16 +406,43 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
             _ => {}
         }
     }
-    unsafe fn write_timestamp(&mut self, _set: &super::QuerySet, _index: u32) {
-        // TODO: If MTLCounterSamplingPoint::AtDrawBoundary/AtBlitBoundary/AtDispatchBoundary is supported,
-        //       we don't need to insert a new encoder, but can instead use respective current one.
-        //let encoder = self.enter_any().unwrap_or_else(|| self.enter_blit());
+    unsafe fn write_timestamp(&mut self, set: &super::QuerySet, index: u32) {
+        let support = self.shared.private_caps.timestamp_query_support;
+        debug_assert!(
+            support.contains(TimestampQuerySupport::STAGE_BOUNDARIES),
+            "Timestamp queries are not supported"
+        );
+        let sample_buffer = set.counter_sample_buffer.as_ref().unwrap();
+        let with_barrier = true;
+
+        // Try to use an existing encoder for timestamp query if possible.
+        // This works only if it's supported for the active encoder.
+        if let (true, Some(encoder)) = (
+            support.contains(TimestampQuerySupport::ON_BLIT_ENCODER),
+            self.state.blit.as_ref(),
+        ) {
+            encoder.sample_counters_in_buffer(sample_buffer, index as _, with_barrier);
+        } else if let (true, Some(encoder)) = (
+            support.contains(TimestampQuerySupport::ON_RENDER_ENCODER),
+            self.state.render.as_ref(),
+        ) {
+            encoder.sample_counters_in_buffer(sample_buffer, index as _, with_barrier);
+        } else if let (true, Some(encoder)) = (
+            support.contains(TimestampQuerySupport::ON_COMPUTE_ENCODER),
+            self.state.compute.as_ref(),
+        ) {
+            encoder.sample_counters_in_buffer(sample_buffer, index as _, with_barrier);
+        } else {
+            // If we're here it means we either have no encoder open, or it's not supported to sample within them.
+            // If this happens with render/compute open, this is an invalid usage!
+            debug_assert!(self.state.render.is_none() && self.state.compute.is_none());
 
-        // TODO: Otherwise, we need to create a new blit command encoder with a descriptor that inserts the timestamps.
-        // Note that as of writing creating a new encoder is not exposed by the metal crate.
-        // https://developer.apple.com/documentation/metal/mtlcommandbuffer/3564431-makeblitcommandencoder
+            // But otherwise it means we'll put defer this to the next created encoder.
+            self.state.pending_timer_queries.push((set.clone(), index));
 
-        // TODO: Enable respective test in `examples/timestamp-queries/src/tests.rs`.
+            // Ensure we didn't already have a blit open.
+            self.leave_blit();
+        };
     }
 
     unsafe fn reset_queries(&mut self, set: &super::QuerySet, range: Range<u32>) {
@@ -342,6 +453,7 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
         };
         encoder.fill_buffer(&set.raw_buffer, raw_range, 0);
     }
+
     unsafe fn copy_query_results(
         &mut self,
         set: &super::QuerySet,
@@ -454,8 +566,29 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
                 }
             }
 
+            let mut sba_index = 0;
+            let mut next_sba_descriptor = || {
+                let sba_descriptor = descriptor
+                    .sample_buffer_attachments()
+                    .object_at(sba_index)
+                    .unwrap();
+
+                sba_descriptor.set_end_of_vertex_sample_index(metal::COUNTER_DONT_SAMPLE);
+                sba_descriptor.set_start_of_fragment_sample_index(metal::COUNTER_DONT_SAMPLE);
+
+                sba_index += 1;
+                sba_descriptor
+            };
+
+            for (set, index) in self.state.pending_timer_queries.drain(..) {
+                let sba_descriptor = next_sba_descriptor();
+                sba_descriptor.set_sample_buffer(set.counter_sample_buffer.as_ref().unwrap());
+                sba_descriptor.set_start_of_vertex_sample_index(index as _);
+                sba_descriptor.set_end_of_fragment_sample_index(metal::COUNTER_DONT_SAMPLE);
+            }
+
             if let Some(ref timestamp_writes) = desc.timestamp_writes {
-                let sba_descriptor = descriptor.sample_buffer_attachments().object_at(0).unwrap();
+                let sba_descriptor = next_sba_descriptor();
                 sba_descriptor.set_sample_buffer(
                     timestamp_writes
                         .query_set
@@ -464,12 +597,16 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
                         .unwrap(),
                 );
 
-                if let Some(start_index) = timestamp_writes.beginning_of_pass_write_index {
-                    sba_descriptor.set_start_of_vertex_sample_index(start_index as _);
-                }
-                if let Some(end_index) = timestamp_writes.end_of_pass_write_index {
-                    sba_descriptor.set_end_of_fragment_sample_index(end_index as _);
-                }
+                sba_descriptor.set_start_of_vertex_sample_index(
+                    timestamp_writes
+                        .beginning_of_pass_write_index
+                        .map_or(metal::COUNTER_DONT_SAMPLE, |i| i as _),
+                );
+                sba_descriptor.set_end_of_fragment_sample_index(
+                    timestamp_writes
+                        .end_of_pass_write_index
+                        .map_or(metal::COUNTER_DONT_SAMPLE, |i| i as _),
+                );
             }
 
             if let Some(occlusion_query_set) = desc.occlusion_query_set {
@@ -697,19 +834,19 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
     }
 
     unsafe fn insert_debug_marker(&mut self, label: &str) {
-        if let Some(encoder) = self.enter_any() {
+        if let Some(encoder) = self.active_encoder() {
             encoder.insert_debug_signpost(label);
         }
     }
     unsafe fn begin_debug_marker(&mut self, group_label: &str) {
-        if let Some(encoder) = self.enter_any() {
+        if let Some(encoder) = self.active_encoder() {
             encoder.push_debug_group(group_label);
         } else if let Some(ref buf) = self.raw_cmd_buf {
             buf.push_debug_group(group_label);
         }
     }
     unsafe fn end_debug_marker(&mut self) {
-        if let Some(encoder) = self.enter_any() {
+        if let Some(encoder) = self.active_encoder() {
             encoder.pop_debug_group();
         } else if let Some(ref buf) = self.raw_cmd_buf {
             buf.pop_debug_group();
@@ -969,11 +1106,25 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
         objc::rc::autoreleasepool(|| {
             let descriptor = metal::ComputePassDescriptor::new();
 
-            if let Some(timestamp_writes) = desc.timestamp_writes.as_ref() {
+            let mut sba_index = 0;
+            let mut next_sba_descriptor = || {
                 let sba_descriptor = descriptor
                     .sample_buffer_attachments()
-                    .object_at(0 as _)
+                    .object_at(sba_index)
                     .unwrap();
+                sba_index += 1;
+                sba_descriptor
+            };
+
+            for (set, index) in self.state.pending_timer_queries.drain(..) {
+                let sba_descriptor = next_sba_descriptor();
+                sba_descriptor.set_sample_buffer(set.counter_sample_buffer.as_ref().unwrap());
+                sba_descriptor.set_start_of_encoder_sample_index(index as _);
+                sba_descriptor.set_end_of_encoder_sample_index(metal::COUNTER_DONT_SAMPLE);
+            }
+
+            if let Some(timestamp_writes) = desc.timestamp_writes.as_ref() {
+                let sba_descriptor = next_sba_descriptor();
                 sba_descriptor.set_sample_buffer(
                     timestamp_writes
                         .query_set
@@ -982,12 +1133,16 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
                         .unwrap(),
                 );
 
-                if let Some(start_index) = timestamp_writes.beginning_of_pass_write_index {
-                    sba_descriptor.set_start_of_encoder_sample_index(start_index as _);
-                }
-                if let Some(end_index) = timestamp_writes.end_of_pass_write_index {
-                    sba_descriptor.set_end_of_encoder_sample_index(end_index as _);
-                }
+                sba_descriptor.set_start_of_encoder_sample_index(
+                    timestamp_writes
+                        .beginning_of_pass_write_index
+                        .map_or(metal::COUNTER_DONT_SAMPLE, |i| i as _),
+                );
+                sba_descriptor.set_end_of_encoder_sample_index(
+                    timestamp_writes
+                        .end_of_pass_write_index
+                        .map_or(metal::COUNTER_DONT_SAMPLE, |i| i as _),
+                );
             }
 
             let encoder = raw.compute_command_encoder_with_descriptor(descriptor);
diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs
index 76f57002ff..c6b91a4f3c 100644
--- a/wgpu-hal/src/metal/mod.rs
+++ b/wgpu-hal/src/metal/mod.rs
@@ -33,6 +33,7 @@ use std::{
 };
 
 use arrayvec::ArrayVec;
+use bitflags::bitflags;
 use metal::foreign_types::ForeignTypeRef as _;
 use parking_lot::Mutex;
 
@@ -143,6 +144,24 @@ impl crate::Instance<Api> for Instance {
     }
 }
 
+bitflags!(
+    /// Similar to `MTLCounterSamplingPoint`, but a bit higher abstracted for our purposes.
+    #[derive(Debug, Copy, Clone)]
+    pub struct TimestampQuerySupport: u32 {
+        /// On creating Metal encoders.
+        const STAGE_BOUNDARIES = 1 << 1;
+        /// Within existing draw encoders.
+        const ON_RENDER_ENCODER = Self::STAGE_BOUNDARIES.bits() | (1 << 2);
+        /// Within existing dispatch encoders.
+        const ON_COMPUTE_ENCODER = Self::STAGE_BOUNDARIES.bits() | (1 << 3);
+        /// Within existing blit encoders.
+        const ON_BLIT_ENCODER = Self::STAGE_BOUNDARIES.bits() | (1 << 4);
+
+        /// Within any wgpu render/compute pass.
+        const INSIDE_WGPU_PASSES = Self::ON_RENDER_ENCODER.bits() | Self::ON_COMPUTE_ENCODER.bits();
+    }
+);
+
 #[allow(dead_code)]
 #[derive(Clone, Debug)]
 struct PrivateCapabilities {
@@ -239,8 +258,7 @@ struct PrivateCapabilities {
     supports_preserve_invariance: bool,
     supports_shader_primitive_index: bool,
     has_unified_memory: Option<bool>,
-    support_timestamp_query: bool,
-    support_timestamp_query_in_passes: bool,
+    timestamp_query_support: TimestampQuerySupport,
 }
 
 #[derive(Clone, Debug)]
@@ -704,7 +722,7 @@ pub struct ComputePipeline {
 unsafe impl Send for ComputePipeline {}
 unsafe impl Sync for ComputePipeline {}
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct QuerySet {
     raw_buffer: metal::Buffer,
     //Metal has a custom buffer for counters.
@@ -787,6 +805,9 @@ struct CommandState {
 
     work_group_memory_sizes: Vec<u32>,
     push_constants: Vec<u32>,
+
+    /// Timer query that should be executed when the next pass starts.
+    pending_timer_queries: Vec<(QuerySet, u32)>,
 }
 
 pub struct CommandEncoder {
diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs
index c892874afa..9f61e2e490 100644
--- a/wgpu-types/src/lib.rs
+++ b/wgpu-types/src/lib.rs
@@ -270,7 +270,7 @@ bitflags::bitflags! {
         /// Supported Platforms:
         /// - Vulkan
         /// - DX12
-        /// - Metal - TODO: Not yet supported on command encoder.
+        /// - Metal
         ///
         /// This is a web and native feature.
         const TIMESTAMP_QUERY = 1 << 1;
@@ -458,10 +458,9 @@ bitflags::bitflags! {
         /// Supported platforms:
         /// - Vulkan
         /// - DX12
+        /// - Metal (AMD & Intel, not Apple GPUs)
         ///
-        /// This is currently unimplemented on Metal.
-        /// When implemented, it will be supported on Metal on AMD and Intel GPUs, but not Apple GPUs.
-        /// (This is a common limitation of tile-based rasterization GPUs)
+        /// This is generally not available on tile-based rasterization GPUs.
         ///
         /// This is a native only feature with a [proposal](https://github.com/gpuweb/gpuweb/blob/0008bd30da2366af88180b511a5d0d0c1dffbc36/proposals/timestamp-query-inside-passes.md) for the web.
         const TIMESTAMP_QUERY_INSIDE_PASSES = 1 << 33;

From f2bd5571863ef967bd730e8713efd69e3293ebdc Mon Sep 17 00:00:00 2001
From: Jim Blandy <jimb@red-bean.com>
Date: Sun, 17 Sep 2023 11:08:42 -0700
Subject: [PATCH 03/14] Tests for wgpu#4139. (#4148)

---
 tests/tests/query_set.rs | 26 ++++++++++++++++++++++++++
 tests/tests/root.rs      |  1 +
 2 files changed, 27 insertions(+)
 create mode 100644 tests/tests/query_set.rs

diff --git a/tests/tests/query_set.rs b/tests/tests/query_set.rs
new file mode 100644
index 0000000000..16e5094089
--- /dev/null
+++ b/tests/tests/query_set.rs
@@ -0,0 +1,26 @@
+use wgpu_test::{initialize_test, FailureCase, TestParameters};
+
+#[test]
+fn drop_failed_timestamp_query_set() {
+    let parameters = TestParameters::default()
+        // https://github.com/gfx-rs/wgpu/issues/4139
+        .expect_fail(FailureCase::always());
+    initialize_test(parameters, |ctx| {
+        // Enter an error scope, so the validation catch-all doesn't
+        // report the error too early.
+        ctx.device.push_error_scope(wgpu::ErrorFilter::Validation);
+
+        // Creating this query set should fail, since we didn't include
+        // TIMESTAMP_QUERY in our required features.
+        let bad_query_set = ctx.device.create_query_set(&wgpu::QuerySetDescriptor {
+            label: Some("doomed query set"),
+            ty: wgpu::QueryType::Timestamp,
+            count: 1,
+        });
+
+        // Dropping this should not panic.
+        drop(bad_query_set);
+
+        assert!(pollster::block_on(ctx.device.pop_error_scope()).is_some());
+    });
+}
diff --git a/tests/tests/root.rs b/tests/tests/root.rs
index 85901ae491..b2695fd827 100644
--- a/tests/tests/root.rs
+++ b/tests/tests/root.rs
@@ -20,6 +20,7 @@ mod instance;
 mod occlusion_query;
 mod partially_bounded_arrays;
 mod poll;
+mod query_set;
 mod queue_transfer;
 mod resource_descriptor_accessor;
 mod resource_error;

From 471229a48f184d9c095c52f7390816f4b4d30e92 Mon Sep 17 00:00:00 2001
From: Jim Blandy <jimb@red-bean.com>
Date: Sun, 17 Sep 2023 12:39:16 -0700
Subject: [PATCH 04/14] Update Naga to df8107b7 (2023-9-15). (#4149)

---
 CHANGELOG.md         | 1 +
 Cargo.lock           | 2 +-
 Cargo.toml           | 2 +-
 wgpu-core/Cargo.toml | 2 +-
 wgpu-hal/Cargo.toml  | 4 ++--
 5 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 039bce54ab..d0eb15b5ae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -72,6 +72,7 @@ By @Valaphee in [#3402](https://github.com/gfx-rs/wgpu/pull/3402)
 
 #### General
 
+- Update Naga to df8107b7 (2023-9-15). By @jimblandy in [#4149](https://github.com/gfx-rs/wgpu/pull/4149)
 - Omit texture store bound checks since they are no-ops if out of bounds on all APIs. By @teoxoy in [#3975](https://github.com/gfx-rs/wgpu/pull/3975)
 - Validate `DownlevelFlags::READ_ONLY_DEPTH_STENCIL`. By @teoxoy in [#4031](https://github.com/gfx-rs/wgpu/pull/4031)
 - Add validation in accordance with WebGPU `setViewport` valid usage for `x`, `y` and `this.[[attachment_size]]`. By @James2022-rgb in [#4058](https://github.com/gfx-rs/wgpu/pull/4058)
diff --git a/Cargo.lock b/Cargo.lock
index 07ed8c2c66..303e94dbd8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1602,7 +1602,7 @@ dependencies = [
 [[package]]
 name = "naga"
 version = "0.13.0"
-source = "git+https://github.com/gfx-rs/naga?rev=cc87b8f9eb30bb55d0735b89d3df3e099e1a6e7c#cc87b8f9eb30bb55d0735b89d3df3e099e1a6e7c"
+source = "git+https://github.com/gfx-rs/naga?rev=df8107b7#df8107b78812cc2b1e3d5de35279cedc1f0da3fb"
 dependencies = [
  "bit-set",
  "bitflags 2.4.0",
diff --git a/Cargo.toml b/Cargo.toml
index 22f79b73b1..40b8c46c96 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -51,7 +51,7 @@ version = "0.17"
 
 [workspace.dependencies.naga]
 git = "https://github.com/gfx-rs/naga"
-rev = "cc87b8f9eb30bb55d0735b89d3df3e099e1a6e7c"
+rev = "df8107b7"
 version = "0.13.0"
 
 [workspace.dependencies]
diff --git a/wgpu-core/Cargo.toml b/wgpu-core/Cargo.toml
index 5487a8bdc0..19bc3ad64d 100644
--- a/wgpu-core/Cargo.toml
+++ b/wgpu-core/Cargo.toml
@@ -72,7 +72,7 @@ thiserror = "1"
 
 [dependencies.naga]
 git = "https://github.com/gfx-rs/naga"
-rev = "cc87b8f9eb30bb55d0735b89d3df3e099e1a6e7c"
+rev = "df8107b7"
 version = "0.13.0"
 features = ["clone", "span", "validate"]
 
diff --git a/wgpu-hal/Cargo.toml b/wgpu-hal/Cargo.toml
index 225f18256a..3db7363616 100644
--- a/wgpu-hal/Cargo.toml
+++ b/wgpu-hal/Cargo.toml
@@ -120,14 +120,14 @@ android_system_properties = "0.1.1"
 
 [dependencies.naga]
 git = "https://github.com/gfx-rs/naga"
-rev = "cc87b8f9eb30bb55d0735b89d3df3e099e1a6e7c"
+rev = "df8107b7"
 version = "0.13.0"
 features = ["clone"]
 
 # DEV dependencies
 [dev-dependencies.naga]
 git = "https://github.com/gfx-rs/naga"
-rev = "cc87b8f9eb30bb55d0735b89d3df3e099e1a6e7c"
+rev = "df8107b7"
 version = "0.13.0"
 features = ["wgsl-in"]
 

From 8adab259053eb0a9ada89815f3680736f4a1f17b Mon Sep 17 00:00:00 2001
From: Teodor Tanasoaia <28601907+teoxoy@users.noreply.github.com>
Date: Mon, 18 Sep 2023 16:43:11 +0200
Subject: [PATCH 05/14] [d3d12] Document `map_blend_factor` (#4151)

---
 wgpu-hal/src/dx12/conv.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/wgpu-hal/src/dx12/conv.rs b/wgpu-hal/src/dx12/conv.rs
index 8b44ae9c4b..908944567a 100644
--- a/wgpu-hal/src/dx12/conv.rs
+++ b/wgpu-hal/src/dx12/conv.rs
@@ -222,6 +222,10 @@ pub fn map_polygon_mode(mode: wgt::PolygonMode) -> d3d12_ty::D3D12_FILL_MODE {
     }
 }
 
+/// D3D12 doesn't support passing factors ending in `_COLOR` for alpha blending
+/// (see https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ns-d3d12-d3d12_render_target_blend_desc).
+/// Therefore this function takes an additional `is_alpha` argument
+/// which if set will return an equivalent `_ALPHA` factor.
 fn map_blend_factor(factor: wgt::BlendFactor, is_alpha: bool) -> d3d12_ty::D3D12_BLEND {
     use wgt::BlendFactor as Bf;
     match factor {

From 507101987baced0f75978486c4db941113409d40 Mon Sep 17 00:00:00 2001
From: Andreas Reich <r_andreas2@web.de>
Date: Mon, 18 Sep 2023 20:58:41 +0200
Subject: [PATCH 06/14] Make `StoreOp` an enum instead of a bool (#4147)

Co-authored-by: Connor Fitzgerald <connorwadefitzgerald@gmail.com>
---
 CHANGELOG.md                                  | 22 ++++++++
 examples/boids/src/main.rs                    |  2 +-
 examples/bunnymark/src/main.rs                |  2 +-
 examples/capture/src/main.rs                  |  2 +-
 examples/conservative-raster/src/main.rs      |  4 +-
 examples/cube/src/main.rs                     |  2 +-
 examples/hello-triangle/src/main.rs           |  2 +-
 examples/hello-windows/src/main.rs            |  2 +-
 examples/mipmap/src/main.rs                   |  4 +-
 examples/msaa-line/src/main.rs                |  4 +-
 examples/shadow/src/main.rs                   |  6 +--
 examples/skybox/src/main.rs                   |  4 +-
 examples/stencil-triangles/src/main.rs        |  4 +-
 examples/texture-arrays/src/main.rs           |  2 +-
 examples/timestamp-queries/src/main.rs        |  2 +-
 examples/water/src/main.rs                    | 10 ++--
 tests/tests/occlusion_query/mod.rs            |  2 +-
 tests/tests/regression/issue_3457.rs          |  4 +-
 tests/tests/scissor_tests/mod.rs              |  2 +-
 tests/tests/shader_primitive_index/mod.rs     |  2 +-
 .../tests/zero_init_texture_after_discard.rs  | 18 +++----
 wgpu/src/backend/direct.rs                    | 21 ++++----
 wgpu/src/backend/web.rs                       |  9 ++--
 wgpu/src/lib.rs                               | 52 ++++++++++++++++---
 24 files changed, 120 insertions(+), 64 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d0eb15b5ae..32f90b3d90 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -64,6 +64,28 @@ let render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
 
 By @Valaphee in [#3402](https://github.com/gfx-rs/wgpu/pull/3402)
 
+
+#### Render pass store operation is now an enum
+
+`wgpu::Operations::store` used to be an underdocumented boolean value,
+causing misunderstandings of the effect of setting it to `false`.
+
+The API now more closely resembles WebGPU which distinguishes between `store` and `discard`,
+see [WebGPU spec on GPUStoreOp](https://gpuweb.github.io/gpuweb/#enumdef-gpustoreop).
+
+```diff
+// ...
+depth_ops: Some(wgpu::Operations {
+    load: wgpu::LoadOp::Clear(1.0),
+-   store: false,
++   store: wgpu::StoreOp::Discard,
+}),
+// ...
+```
+
+By @wumpf in [#4147](https://github.com/gfx-rs/wgpu/pull/4147)
+
+
 ### Added/New Features
 
 - Add `gles_minor_version` field to `wgpu::InstanceDescriptor`. By @PJB3005 in [#3998](https://github.com/gfx-rs/wgpu/pull/3998)
diff --git a/examples/boids/src/main.rs b/examples/boids/src/main.rs
index e8aa2f71fd..eb5146f8bd 100644
--- a/examples/boids/src/main.rs
+++ b/examples/boids/src/main.rs
@@ -276,7 +276,7 @@ impl wgpu_example::framework::Example for Example {
                 // Not clearing here in order to test wgpu's zero texture initialization on a surface texture.
                 // Users should avoid loading uninitialized memory since this can cause additional overhead.
                 load: wgpu::LoadOp::Load,
-                store: true,
+                store: wgpu::StoreOp::Store,
             },
         })];
         let render_pass_descriptor = wgpu::RenderPassDescriptor {
diff --git a/examples/bunnymark/src/main.rs b/examples/bunnymark/src/main.rs
index 256083eebb..ca8dbbee06 100644
--- a/examples/bunnymark/src/main.rs
+++ b/examples/bunnymark/src/main.rs
@@ -332,7 +332,7 @@ impl wgpu_example::framework::Example for Example {
                     resolve_target: None,
                     ops: wgpu::Operations {
                         load: wgpu::LoadOp::Clear(clear_color),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: None,
diff --git a/examples/capture/src/main.rs b/examples/capture/src/main.rs
index b783b3af80..47a453de6b 100644
--- a/examples/capture/src/main.rs
+++ b/examples/capture/src/main.rs
@@ -101,7 +101,7 @@ async fn create_red_image_with_dimensions(
                 resolve_target: None,
                 ops: wgpu::Operations {
                     load: wgpu::LoadOp::Clear(wgpu::Color::RED),
-                    store: true,
+                    store: wgpu::StoreOp::Store,
                 },
             })],
             depth_stencil_attachment: None,
diff --git a/examples/conservative-raster/src/main.rs b/examples/conservative-raster/src/main.rs
index e5cfb4d775..093740a206 100644
--- a/examples/conservative-raster/src/main.rs
+++ b/examples/conservative-raster/src/main.rs
@@ -269,7 +269,7 @@ impl wgpu_example::framework::Example for Example {
                     resolve_target: None,
                     ops: wgpu::Operations {
                         load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: None,
@@ -290,7 +290,7 @@ impl wgpu_example::framework::Example for Example {
                     resolve_target: None,
                     ops: wgpu::Operations {
                         load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: None,
diff --git a/examples/cube/src/main.rs b/examples/cube/src/main.rs
index a10dfd0fd0..b031e1004c 100644
--- a/examples/cube/src/main.rs
+++ b/examples/cube/src/main.rs
@@ -375,7 +375,7 @@ impl wgpu_example::framework::Example for Example {
                             b: 0.3,
                             a: 1.0,
                         }),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: None,
diff --git a/examples/hello-triangle/src/main.rs b/examples/hello-triangle/src/main.rs
index c5432acd07..ebb8f6b736 100644
--- a/examples/hello-triangle/src/main.rs
+++ b/examples/hello-triangle/src/main.rs
@@ -118,7 +118,7 @@ async fn run(event_loop: EventLoop<()>, window: Window) {
                             resolve_target: None,
                             ops: wgpu::Operations {
                                 load: wgpu::LoadOp::Clear(wgpu::Color::GREEN),
-                                store: true,
+                                store: wgpu::StoreOp::Store,
                             },
                         })],
                         depth_stencil_attachment: None,
diff --git a/examples/hello-windows/src/main.rs b/examples/hello-windows/src/main.rs
index f368804c36..ba28341395 100644
--- a/examples/hello-windows/src/main.rs
+++ b/examples/hello-windows/src/main.rs
@@ -131,7 +131,7 @@ async fn run(event_loop: EventLoop<()>, viewports: Vec<(Window, wgpu::Color)>) {
                                 resolve_target: None,
                                 ops: wgpu::Operations {
                                     load: wgpu::LoadOp::Clear(viewport.desc.background),
-                                    store: true,
+                                    store: wgpu::StoreOp::Store,
                                 },
                             })],
                             depth_stencil_attachment: None,
diff --git a/examples/mipmap/src/main.rs b/examples/mipmap/src/main.rs
index a85110ff14..5536579b0b 100644
--- a/examples/mipmap/src/main.rs
+++ b/examples/mipmap/src/main.rs
@@ -163,7 +163,7 @@ impl Example {
                     resolve_target: None,
                     ops: wgpu::Operations {
                         load: wgpu::LoadOp::Clear(wgpu::Color::WHITE),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: None,
@@ -490,7 +490,7 @@ impl wgpu_example::framework::Example for Example {
                     resolve_target: None,
                     ops: wgpu::Operations {
                         load: wgpu::LoadOp::Clear(clear_color),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: None,
diff --git a/examples/msaa-line/src/main.rs b/examples/msaa-line/src/main.rs
index aa7a277418..07cc4eaf57 100644
--- a/examples/msaa-line/src/main.rs
+++ b/examples/msaa-line/src/main.rs
@@ -282,7 +282,7 @@ impl wgpu_example::framework::Example for Example {
                     resolve_target: None,
                     ops: wgpu::Operations {
                         load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 }
             } else {
@@ -293,7 +293,7 @@ impl wgpu_example::framework::Example for Example {
                         load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
                         // Storing pre-resolve MSAA data is unnecessary if it isn't used later.
                         // On tile-based GPU, avoid store can reduce your app's memory footprint.
-                        store: false,
+                        store: wgpu::StoreOp::Discard,
                     },
                 }
             };
diff --git a/examples/shadow/src/main.rs b/examples/shadow/src/main.rs
index 3f963d0c53..c63076e6ac 100644
--- a/examples/shadow/src/main.rs
+++ b/examples/shadow/src/main.rs
@@ -773,7 +773,7 @@ impl wgpu_example::framework::Example for Example {
                         view: &light.target_view,
                         depth_ops: Some(wgpu::Operations {
                             load: wgpu::LoadOp::Clear(1.0),
-                            store: true,
+                            store: wgpu::StoreOp::Store,
                         }),
                         stencil_ops: None,
                     }),
@@ -810,14 +810,14 @@ impl wgpu_example::framework::Example for Example {
                             b: 0.3,
                             a: 1.0,
                         }),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: Some(wgpu::RenderPassDepthStencilAttachment {
                     view: &self.forward_depth,
                     depth_ops: Some(wgpu::Operations {
                         load: wgpu::LoadOp::Clear(1.0),
-                        store: false,
+                        store: wgpu::StoreOp::Discard,
                     }),
                     stencil_ops: None,
                 }),
diff --git a/examples/skybox/src/main.rs b/examples/skybox/src/main.rs
index d09622f53c..5d91c62865 100644
--- a/examples/skybox/src/main.rs
+++ b/examples/skybox/src/main.rs
@@ -428,14 +428,14 @@ impl wgpu_example::framework::Example for Skybox {
                             b: 0.3,
                             a: 1.0,
                         }),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: Some(wgpu::RenderPassDepthStencilAttachment {
                     view: &self.depth_view,
                     depth_ops: Some(wgpu::Operations {
                         load: wgpu::LoadOp::Clear(1.0),
-                        store: false,
+                        store: wgpu::StoreOp::Discard,
                     }),
                     stencil_ops: None,
                 }),
diff --git a/examples/stencil-triangles/src/main.rs b/examples/stencil-triangles/src/main.rs
index 55aad9c9ba..9d918500d5 100644
--- a/examples/stencil-triangles/src/main.rs
+++ b/examples/stencil-triangles/src/main.rs
@@ -200,7 +200,7 @@ impl wgpu_example::framework::Example for Triangles {
                             b: 0.3,
                             a: 1.0,
                         }),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: Some(wgpu::RenderPassDepthStencilAttachment {
@@ -208,7 +208,7 @@ impl wgpu_example::framework::Example for Triangles {
                     depth_ops: None,
                     stencil_ops: Some(wgpu::Operations {
                         load: wgpu::LoadOp::Clear(0),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     }),
                 }),
                 timestamp_writes: None,
diff --git a/examples/texture-arrays/src/main.rs b/examples/texture-arrays/src/main.rs
index 373c2396ae..af9cfa56ff 100644
--- a/examples/texture-arrays/src/main.rs
+++ b/examples/texture-arrays/src/main.rs
@@ -379,7 +379,7 @@ impl wgpu_example::framework::Example for Example {
                 resolve_target: None,
                 ops: wgpu::Operations {
                     load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
-                    store: true,
+                    store: wgpu::StoreOp::Store,
                 },
             })],
             depth_stencil_attachment: None,
diff --git a/examples/timestamp-queries/src/main.rs b/examples/timestamp-queries/src/main.rs
index f8c524f03c..d463ea6579 100644
--- a/examples/timestamp-queries/src/main.rs
+++ b/examples/timestamp-queries/src/main.rs
@@ -375,7 +375,7 @@ fn render_pass(
             resolve_target: None,
             ops: wgpu::Operations {
                 load: wgpu::LoadOp::Clear(wgpu::Color::GREEN),
-                store: true,
+                store: wgpu::StoreOp::Store,
             },
         })],
         depth_stencil_attachment: None,
diff --git a/examples/water/src/main.rs b/examples/water/src/main.rs
index 5d5daa1f59..da9f1238ab 100644
--- a/examples/water/src/main.rs
+++ b/examples/water/src/main.rs
@@ -739,7 +739,7 @@ impl wgpu_example::framework::Example for Example {
                     resolve_target: None,
                     ops: wgpu::Operations {
                         load: wgpu::LoadOp::Clear(back_color),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 // We still need to use the depth buffer here
@@ -748,7 +748,7 @@ impl wgpu_example::framework::Example for Example {
                     view: &self.depth_buffer,
                     depth_ops: Some(wgpu::Operations {
                         load: wgpu::LoadOp::Clear(1.0),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     }),
                     stencil_ops: None,
                 }),
@@ -768,14 +768,14 @@ impl wgpu_example::framework::Example for Example {
                     resolve_target: None,
                     ops: wgpu::Operations {
                         load: wgpu::LoadOp::Clear(back_color),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: Some(wgpu::RenderPassDepthStencilAttachment {
                     view: &self.depth_buffer,
                     depth_ops: Some(wgpu::Operations {
                         load: wgpu::LoadOp::Clear(1.0),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     }),
                     stencil_ops: None,
                 }),
@@ -797,7 +797,7 @@ impl wgpu_example::framework::Example for Example {
                     resolve_target: None,
                     ops: wgpu::Operations {
                         load: wgpu::LoadOp::Load,
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: Some(wgpu::RenderPassDepthStencilAttachment {
diff --git a/tests/tests/occlusion_query/mod.rs b/tests/tests/occlusion_query/mod.rs
index eab0828e41..7747eaa624 100644
--- a/tests/tests/occlusion_query/mod.rs
+++ b/tests/tests/occlusion_query/mod.rs
@@ -69,7 +69,7 @@ fn occlusion_query() {
                     view: &depth_texture_view,
                     depth_ops: Some(wgpu::Operations {
                         load: wgpu::LoadOp::Clear(1.0),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     }),
                     stencil_ops: None,
                 }),
diff --git a/tests/tests/regression/issue_3457.rs b/tests/tests/regression/issue_3457.rs
index 2dccd3d427..0d2c086ed5 100644
--- a/tests/tests/regression/issue_3457.rs
+++ b/tests/tests/regression/issue_3457.rs
@@ -140,7 +140,7 @@ fn pass_reset_vertex_buffer() {
                 resolve_target: None,
                 ops: Operations {
                     load: LoadOp::Clear(Color::BLACK),
-                    store: false,
+                    store: StoreOp::Discard,
                 },
             })],
             depth_stencil_attachment: None,
@@ -175,7 +175,7 @@ fn pass_reset_vertex_buffer() {
                 resolve_target: None,
                 ops: Operations {
                     load: LoadOp::Clear(Color::BLACK),
-                    store: false,
+                    store: StoreOp::Discard,
                 },
             })],
             depth_stencil_attachment: None,
diff --git a/tests/tests/scissor_tests/mod.rs b/tests/tests/scissor_tests/mod.rs
index da050cb61f..a921827e0d 100644
--- a/tests/tests/scissor_tests/mod.rs
+++ b/tests/tests/scissor_tests/mod.rs
@@ -75,7 +75,7 @@ fn scissor_test_impl(ctx: &TestingContext, scissor_rect: Rect, expected_data: [u
                             b: 0.0,
                             a: 0.0,
                         }),
-                        store: true,
+                        store: wgpu::StoreOp::Store,
                     },
                 })],
                 depth_stencil_attachment: None,
diff --git a/tests/tests/shader_primitive_index/mod.rs b/tests/tests/shader_primitive_index/mod.rs
index a05d1cd5f0..2739b2e77d 100644
--- a/tests/tests/shader_primitive_index/mod.rs
+++ b/tests/tests/shader_primitive_index/mod.rs
@@ -180,7 +180,7 @@ fn pulling_common(
             color_attachments: &[Some(wgpu::RenderPassColorAttachment {
                 ops: wgpu::Operations {
                     load: wgpu::LoadOp::Clear(wgpu::Color::WHITE),
-                    store: true,
+                    store: wgpu::StoreOp::Store,
                 },
                 resolve_target: None,
                 view: &color_view,
diff --git a/tests/tests/zero_init_texture_after_discard.rs b/tests/tests/zero_init_texture_after_discard.rs
index 2b757e069a..e47f1aa0fa 100644
--- a/tests/tests/zero_init_texture_after_discard.rs
+++ b/tests/tests/zero_init_texture_after_discard.rs
@@ -155,11 +155,11 @@ impl<'ctx> TestCase<'ctx> {
                     view: &texture.create_view(&TextureViewDescriptor::default()),
                     depth_ops: format.has_depth_aspect().then_some(Operations {
                         load: LoadOp::Clear(1.0),
-                        store: true,
+                        store: StoreOp::Store,
                     }),
                     stencil_ops: format.has_stencil_aspect().then_some(Operations {
                         load: LoadOp::Clear(0xFFFFFFFF),
-                        store: true,
+                        store: StoreOp::Store,
                     }),
                 }),
                 timestamp_writes: None,
@@ -230,7 +230,7 @@ impl<'ctx> TestCase<'ctx> {
                         resolve_target: None,
                         ops: Operations {
                             load: LoadOp::Load,
-                            store: false, // discard!
+                            store: StoreOp::Discard,
                         },
                     },
                 )],
@@ -239,11 +239,11 @@ impl<'ctx> TestCase<'ctx> {
                         view: &self.texture.create_view(&TextureViewDescriptor::default()),
                         depth_ops: self.format.has_depth_aspect().then_some(Operations {
                             load: LoadOp::Load,
-                            store: false, // discard!
+                            store: StoreOp::Discard,
                         }),
                         stencil_ops: self.format.has_stencil_aspect().then_some(Operations {
                             load: LoadOp::Load,
-                            store: false, // discard!
+                            store: StoreOp::Discard,
                         }),
                     },
                 ),
@@ -264,11 +264,11 @@ impl<'ctx> TestCase<'ctx> {
                         view: &self.texture.create_view(&TextureViewDescriptor::default()),
                         depth_ops: Some(Operations {
                             load: LoadOp::Load,
-                            store: false, // discard!
+                            store: StoreOp::Discard,
                         }),
                         stencil_ops: self.format.has_stencil_aspect().then_some(Operations {
                             load: LoadOp::Clear(0),
-                            store: true,
+                            store: StoreOp::Store,
                         }),
                     },
                 ),
@@ -289,11 +289,11 @@ impl<'ctx> TestCase<'ctx> {
                         view: &self.texture.create_view(&TextureViewDescriptor::default()),
                         depth_ops: self.format.has_depth_aspect().then_some(Operations {
                             load: LoadOp::Clear(0.0),
-                            store: true,
+                            store: StoreOp::Store,
                         }),
                         stencil_ops: Some(Operations {
                             load: LoadOp::Load,
-                            store: false, // discard!
+                            store: StoreOp::Discard,
                         }),
                     },
                 ),
diff --git a/wgpu/src/backend/direct.rs b/wgpu/src/backend/direct.rs
index 2e15e295e8..3d3028d334 100644
--- a/wgpu/src/backend/direct.rs
+++ b/wgpu/src/backend/direct.rs
@@ -4,7 +4,7 @@ use crate::{
     BufferDescriptor, CommandEncoderDescriptor, ComputePassDescriptor, ComputePipelineDescriptor,
     DownlevelCapabilities, Features, Label, Limits, LoadOp, MapMode, Operations,
     PipelineLayoutDescriptor, RenderBundleEncoderDescriptor, RenderPipelineDescriptor,
-    SamplerDescriptor, ShaderModuleDescriptor, ShaderModuleDescriptorSpirV, ShaderSource,
+    SamplerDescriptor, ShaderModuleDescriptor, ShaderModuleDescriptorSpirV, ShaderSource, StoreOp,
     SurfaceStatus, TextureDescriptor, TextureViewDescriptor, UncapturedErrorHandler,
 };
 
@@ -392,6 +392,13 @@ fn map_texture_tagged_copy_view(
     }
 }
 
+fn map_store_op(op: StoreOp) -> wgc::command::StoreOp {
+    match op {
+        StoreOp::Store => wgc::command::StoreOp::Store,
+        StoreOp::Discard => wgc::command::StoreOp::Discard,
+    }
+}
+
 fn map_pass_channel<V: Copy + Default>(
     ops: Option<&Operations<V>>,
 ) -> wgc::command::PassChannel<V> {
@@ -401,11 +408,7 @@ fn map_pass_channel<V: Copy + Default>(
             store,
         }) => wgc::command::PassChannel {
             load_op: wgc::command::LoadOp::Clear,
-            store_op: if store {
-                wgc::command::StoreOp::Store
-            } else {
-                wgc::command::StoreOp::Discard
-            },
+            store_op: map_store_op(store),
             clear_value,
             read_only: false,
         },
@@ -414,11 +417,7 @@ fn map_pass_channel<V: Copy + Default>(
             store,
         }) => wgc::command::PassChannel {
             load_op: wgc::command::LoadOp::Load,
-            store_op: if store {
-                wgc::command::StoreOp::Store
-            } else {
-                wgc::command::StoreOp::Discard
-            },
+            store_op: map_store_op(store),
             clear_value: V::default(),
             read_only: false,
         },
diff --git a/wgpu/src/backend/web.rs b/wgpu/src/backend/web.rs
index 2f83d50c55..d457681e57 100644
--- a/wgpu/src/backend/web.rs
+++ b/wgpu/src/backend/web.rs
@@ -621,11 +621,10 @@ fn map_color(color: wgt::Color) -> web_sys::GpuColorDict {
     web_sys::GpuColorDict::new(color.a, color.b, color.g, color.r)
 }
 
-fn map_store_op(store: bool) -> web_sys::GpuStoreOp {
-    if store {
-        web_sys::GpuStoreOp::Store
-    } else {
-        web_sys::GpuStoreOp::Discard
+fn map_store_op(store: crate::StoreOp) -> web_sys::GpuStoreOp {
+    match store {
+        crate::StoreOp::Store => web_sys::GpuStoreOp::Store,
+        crate::StoreOp::Discard => web_sys::GpuStoreOp::Discard,
     }
 }
 
diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs
index 19dc20120d..88b852eaa4 100644
--- a/wgpu/src/lib.rs
+++ b/wgpu/src/lib.rs
@@ -1006,16 +1006,24 @@ static_assertions::assert_impl_all!(BufferBinding: Send, Sync);
 
 /// Operation to perform to the output attachment at the start of a render pass.
 ///
-/// The render target must be cleared at least once before its content is loaded.
-///
-/// Corresponds to [WebGPU `GPULoadOp`](https://gpuweb.github.io/gpuweb/#enumdef-gpuloadop).
+/// Corresponds to [WebGPU `GPULoadOp`](https://gpuweb.github.io/gpuweb/#enumdef-gpuloadop),
+/// plus the corresponding clearValue.
 #[derive(Copy, Clone, Debug, Hash, Eq, PartialEq)]
 #[cfg_attr(feature = "trace", derive(serde::Serialize))]
 #[cfg_attr(feature = "replay", derive(serde::Deserialize))]
 pub enum LoadOp<V> {
-    /// Clear with a specified value.
+    /// Loads the specified value for this attachment into the render pass.
+    ///
+    /// On some GPU hardware (primarily mobile), "clear" is significantly cheaper
+    /// because it avoids loading data from main memory into tile-local memory.
+    ///
+    /// On other GPU hardware, there isn’t a significant difference.
+    ///
+    /// As a result, it is recommended to use "clear" rather than "load" in cases
+    /// where the initial value doesn’t matter
+    /// (e.g. the render target will be cleared using a skybox).
     Clear(V),
-    /// Load from memory.
+    /// Loads the existing value for this attachment into the render pass.
     Load,
 }
 
@@ -1025,6 +1033,28 @@ impl<V: Default> Default for LoadOp<V> {
     }
 }
 
+/// Operation to perform to the output attachment at the end of a render pass.
+///
+/// Corresponds to [WebGPU `GPUStoreOp`](https://gpuweb.github.io/gpuweb/#enumdef-gpustoreop).
+#[derive(Copy, Clone, Debug, Hash, Eq, PartialEq, Default)]
+#[cfg_attr(feature = "trace", derive(serde::Serialize))]
+#[cfg_attr(feature = "replay", derive(serde::Deserialize))]
+pub enum StoreOp {
+    /// Stores the resulting value of the render pass for this attachment.
+    #[default]
+    Store,
+    /// Discards the resulting value of the render pass for this attachment.
+    ///
+    /// The attachment will be treated as uninitialized afterwards.
+    /// (If only either Depth or Stencil texture-aspects is set to `Discard`,
+    /// the respective other texture-aspect will be preserved.)
+    ///
+    /// This can be significantly faster on tile-based render hardware.
+    ///
+    /// Prefer this if the attachment is not read by subsequent passes.
+    Discard,
+}
+
 /// Pair of load and store operations for an attachment aspect.
 ///
 /// This type is unique to the Rust API of `wgpu`. In the WebGPU specification,
@@ -1036,14 +1066,18 @@ pub struct Operations<V> {
     /// How data should be read through this attachment.
     pub load: LoadOp<V>,
     /// Whether data will be written to through this attachment.
-    pub store: bool,
+    ///
+    /// Note that resolve textures (if specified) are always written to,
+    /// regardless of this setting.
+    pub store: StoreOp,
 }
 
 impl<V: Default> Default for Operations<V> {
+    #[inline]
     fn default() -> Self {
         Self {
-            load: Default::default(),
-            store: true,
+            load: LoadOp::<V>::default(),
+            store: StoreOp::default(),
         }
     }
 }
@@ -1084,6 +1118,8 @@ pub struct RenderPassColorAttachment<'tex> {
     /// The view to use as an attachment.
     pub view: &'tex TextureView,
     /// The view that will receive the resolved output if multisampling is used.
+    ///
+    /// If set, it is always written to, regardless of how [`Self::ops`] is configured.
     pub resolve_target: Option<&'tex TextureView>,
     /// What operations will be performed on this color attachment.
     pub ops: Operations<Color>,

From 87a0cd0e69dabd66da4d542b1cec0c71765cfc04 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 19 Sep 2023 09:35:02 +0200
Subject: [PATCH 07/14] Bump profiling from 1.0.10 to 1.0.11 (#4153)

Bumps [profiling](https://github.com/aclysma/profiling) from 1.0.10 to 1.0.11.
- [Changelog](https://github.com/aclysma/profiling/blob/master/CHANGELOG.md)
- [Commits](https://github.com/aclysma/profiling/commits)

---
updated-dependencies:
- dependency-name: profiling
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 303e94dbd8..f966a215f0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2126,9 +2126,9 @@ dependencies = [
 
 [[package]]
 name = "profiling"
-version = "1.0.10"
+version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "45f10e75d83c7aec79a6aa46f897075890e156b105eebe51cfa0abce51af025f"
+checksum = "f89dff0959d98c9758c88826cc002e2c3d0b9dfac4139711d1f30de442f1139b"
 
 [[package]]
 name = "quote"

From 5c26841d66b7fb0675ba6b8509b483e6ac2c30ee Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 19 Sep 2023 09:35:12 +0200
Subject: [PATCH 08/14] Bump termcolor from 1.2.0 to 1.3.0 (#4152)

Bumps [termcolor](https://github.com/BurntSushi/termcolor) from 1.2.0 to 1.3.0.
- [Commits](https://github.com/BurntSushi/termcolor/compare/1.2.0...1.3.0)

---
updated-dependencies:
- dependency-name: termcolor
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Cargo.lock | 4 ++--
 Cargo.toml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f966a215f0..9978941db4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2632,9 +2632,9 @@ dependencies = [
 
 [[package]]
 name = "termcolor"
-version = "1.2.0"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
+checksum = "6093bad37da69aab9d123a8091e4be0aa4a03e4d601ec641c327398315f62b64"
 dependencies = [
  "winapi-util",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 40b8c46c96..10378fc231 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -142,7 +142,7 @@ deno_web = "0.137.0"
 deno_webidl = "0.106.0"
 deno_webgpu = { path = "./deno_webgpu" }
 tokio = "1.32.0"
-termcolor = "1.2.0"
+termcolor = "1.3.0"
 
 [patch."https://github.com/gfx-rs/naga"]
 #naga = { path = "../naga" }

From dc5beac8c96536d8c8a2ca98c22021dff14e53cf Mon Sep 17 00:00:00 2001
From: Frederik Magnus Johansen Vestre <freqmod@gmail.com>
Date: Tue, 19 Sep 2023 13:26:30 +0200
Subject: [PATCH 09/14] Support dual source blending (#4022)

Co-authored-by: Teodor Tanasoaia <28601907+teoxoy@users.noreply.github.com>
---
 CHANGELOG.md                     |  1 +
 wgpu-core/src/device/resource.rs | 46 +++++++++++++++++++++++++++++++-
 wgpu-core/src/pipeline.rs        |  9 +++++++
 wgpu-core/src/validation.rs      | 14 +++++++++-
 wgpu-hal/src/dx12/adapter.rs     |  4 ++-
 wgpu-hal/src/dx12/conv.rs        | 12 ++++-----
 wgpu-hal/src/gles/adapter.rs     |  4 +++
 wgpu-hal/src/gles/conv.rs        |  4 +++
 wgpu-hal/src/metal/adapter.rs    |  4 +++
 wgpu-hal/src/metal/conv.rs       | 10 +++----
 wgpu-hal/src/vulkan/adapter.rs   |  2 ++
 wgpu-hal/src/vulkan/conv.rs      |  4 +++
 wgpu-types/src/lib.rs            | 37 ++++++++++++++++++++++++-
 wgpu/src/backend/web.rs          |  9 +++++++
 14 files changed, 144 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 32f90b3d90..d60e0cd84f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -100,6 +100,7 @@ By @wumpf in [#4147](https://github.com/gfx-rs/wgpu/pull/4147)
 - Add validation in accordance with WebGPU `setViewport` valid usage for `x`, `y` and `this.[[attachment_size]]`. By @James2022-rgb in [#4058](https://github.com/gfx-rs/wgpu/pull/4058)
 - `wgpu::CreateSurfaceError` and `wgpu::RequestDeviceError` now give details of the failure, but no longer implement `PartialEq` and cannot be constructed. By @kpreid in [#4066](https://github.com/gfx-rs/wgpu/pull/4066) and [#4145](https://github.com/gfx-rs/wgpu/pull/4145)
 - Make `WGPU_POWER_PREF=none` a valid value. By @fornwall in [4076](https://github.com/gfx-rs/wgpu/pull/4076)
+- Support dual source blending in OpenGL ES, Metal, Vulkan & DX12. By @freqmod in [4022](https://github.com/gfx-rs/wgpu/pull/4022)
 
 #### Vulkan
 
diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs
index 73f1887e10..ba7006d3d5 100644
--- a/wgpu-core/src/device/resource.rs
+++ b/wgpu-core/src/device/resource.rs
@@ -1276,6 +1276,10 @@ impl<A: HalApi> Device<A> {
                 .flags
                 .contains(wgt::DownlevelFlags::MULTISAMPLED_SHADING),
         );
+        caps.set(
+            Caps::DUAL_SOURCE_BLENDING,
+            self.features.contains(wgt::Features::DUAL_SOURCE_BLENDING),
+        );
 
         let info = naga::valid::Validator::new(naga::valid::ValidationFlags::all(), caps)
             .validate(&module)
@@ -2560,6 +2564,8 @@ impl<A: HalApi> Device<A> {
         let mut vertex_steps = Vec::with_capacity(desc.vertex.buffers.len());
         let mut vertex_buffers = Vec::with_capacity(desc.vertex.buffers.len());
         let mut total_attributes = 0;
+        let mut shader_expects_dual_source_blending = false;
+        let mut pipeline_expects_dual_source_blending = false;
         for (i, vb_state) in desc.vertex.buffers.iter().enumerate() {
             vertex_steps.push(pipeline::VertexStep {
                 stride: vb_state.array_stride,
@@ -2700,7 +2706,25 @@ impl<A: HalApi> Device<A> {
                     {
                         break Some(pipeline::ColorStateError::FormatNotMultisampled(cs.format));
                     }
-
+                    if let Some(blend_mode) = cs.blend {
+                        for factor in [
+                            blend_mode.color.src_factor,
+                            blend_mode.color.dst_factor,
+                            blend_mode.alpha.src_factor,
+                            blend_mode.alpha.dst_factor,
+                        ] {
+                            if factor.ref_second_blend_source() {
+                                self.require_features(wgt::Features::DUAL_SOURCE_BLENDING)?;
+                                if i == 0 {
+                                    pipeline_expects_dual_source_blending = true;
+                                    break;
+                                } else {
+                                    return Err(crate::pipeline::CreateRenderPipelineError
+                            ::BlendFactorOnUnsupportedTarget { factor, target: i as u32 });
+                                }
+                            }
+                        }
+                    }
                     break None;
                 };
                 if let Some(e) = error {
@@ -2857,6 +2881,15 @@ impl<A: HalApi> Device<A> {
                     }
                 }
 
+                if let Some(ref interface) = shader_module.interface {
+                    shader_expects_dual_source_blending = interface
+                        .fragment_uses_dual_source_blending(&fragment.stage.entry_point)
+                        .map_err(|error| pipeline::CreateRenderPipelineError::Stage {
+                            stage: flag,
+                            error,
+                        })?;
+                }
+
                 Some(hal::ProgrammableStage {
                     module: &shader_module.raw,
                     entry_point: fragment.stage.entry_point.as_ref(),
@@ -2865,6 +2898,17 @@ impl<A: HalApi> Device<A> {
             None => None,
         };
 
+        if !pipeline_expects_dual_source_blending && shader_expects_dual_source_blending {
+            return Err(
+                pipeline::CreateRenderPipelineError::ShaderExpectsPipelineToUseDualSourceBlending,
+            );
+        }
+        if pipeline_expects_dual_source_blending && !shader_expects_dual_source_blending {
+            return Err(
+                pipeline::CreateRenderPipelineError::PipelineExpectsShaderToUseDualSourceBlending,
+            );
+        }
+
         if validated_stages.contains(wgt::ShaderStages::FRAGMENT) {
             for (i, output) in io.iter() {
                 match color_targets.get(*i as usize) {
diff --git a/wgpu-core/src/pipeline.rs b/wgpu-core/src/pipeline.rs
index da06b652ea..c78a79820d 100644
--- a/wgpu-core/src/pipeline.rs
+++ b/wgpu-core/src/pipeline.rs
@@ -384,6 +384,15 @@ pub enum CreateRenderPipelineError {
     },
     #[error("In the provided shader, the type given for group {group} binding {binding} has a size of {size}. As the device does not support `DownlevelFlags::BUFFER_BINDINGS_NOT_16_BYTE_ALIGNED`, the type must have a size that is a multiple of 16 bytes.")]
     UnalignedShader { group: u32, binding: u32, size: u64 },
+    #[error("Using the blend factor {factor:?} for render target {target} is not possible. Only the first render target may be used when dual-source blending.")]
+    BlendFactorOnUnsupportedTarget {
+        factor: wgt::BlendFactor,
+        target: u32,
+    },
+    #[error("Pipeline expects the shader entry point to make use of dual-source blending.")]
+    PipelineExpectsShaderToUseDualSourceBlending,
+    #[error("Shader entry point expects the pipeline to make use of dual-source blending.")]
+    ShaderExpectsPipelineToUseDualSourceBlending,
 }
 
 bitflags::bitflags! {
diff --git a/wgpu-core/src/validation.rs b/wgpu-core/src/validation.rs
index e3ecb916d3..778cc26cd5 100644
--- a/wgpu-core/src/validation.rs
+++ b/wgpu-core/src/validation.rs
@@ -116,6 +116,7 @@ struct EntryPoint {
     spec_constants: Vec<SpecializationConstant>,
     sampling_pairs: FastHashSet<(naga::Handle<Resource>, naga::Handle<Resource>)>,
     workgroup_size: [u32; 3],
+    dual_source_blending: bool,
 }
 
 #[derive(Debug)]
@@ -903,7 +904,7 @@ impl Interface {
                 ep.sampling_pairs
                     .insert((resource_mapping[&key.image], resource_mapping[&key.sampler]));
             }
-
+            ep.dual_source_blending = info.dual_source_blending;
             ep.workgroup_size = entry_point.workgroup_size;
 
             entry_points.insert((entry_point.stage, entry_point.name.clone()), ep);
@@ -1177,4 +1178,15 @@ impl Interface {
             .collect();
         Ok(outputs)
     }
+
+    pub fn fragment_uses_dual_source_blending(
+        &self,
+        entry_point_name: &str,
+    ) -> Result<bool, StageError> {
+        let pair = (naga::ShaderStage::Fragment, entry_point_name.to_string());
+        self.entry_points
+            .get(&pair)
+            .ok_or(StageError::MissingEntryPoint(pair.1))
+            .map(|ep| ep.dual_source_blending)
+    }
 }
diff --git a/wgpu-hal/src/dx12/adapter.rs b/wgpu-hal/src/dx12/adapter.rs
index 02cde913ca..3959deeccd 100644
--- a/wgpu-hal/src/dx12/adapter.rs
+++ b/wgpu-hal/src/dx12/adapter.rs
@@ -250,7 +250,9 @@ impl super::Adapter {
             | wgt::Features::TEXTURE_FORMAT_16BIT_NORM
             | wgt::Features::PUSH_CONSTANTS
             | wgt::Features::SHADER_PRIMITIVE_INDEX
-            | wgt::Features::RG11B10UFLOAT_RENDERABLE;
+            | wgt::Features::RG11B10UFLOAT_RENDERABLE
+            | wgt::Features::DUAL_SOURCE_BLENDING;
+
         //TODO: in order to expose this, we need to run a compute shader
         // that extract the necessary statistics out of the D3D12 result.
         // Alternatively, we could allocate a buffer for the query set,
diff --git a/wgpu-hal/src/dx12/conv.rs b/wgpu-hal/src/dx12/conv.rs
index 908944567a..f484d1a9e2 100644
--- a/wgpu-hal/src/dx12/conv.rs
+++ b/wgpu-hal/src/dx12/conv.rs
@@ -246,12 +246,12 @@ fn map_blend_factor(factor: wgt::BlendFactor, is_alpha: bool) -> d3d12_ty::D3D12
         Bf::Constant => d3d12_ty::D3D12_BLEND_BLEND_FACTOR,
         Bf::OneMinusConstant => d3d12_ty::D3D12_BLEND_INV_BLEND_FACTOR,
         Bf::SrcAlphaSaturated => d3d12_ty::D3D12_BLEND_SRC_ALPHA_SAT,
-        //Bf::Src1Color if is_alpha => d3d12_ty::D3D12_BLEND_SRC1_ALPHA,
-        //Bf::Src1Color => d3d12_ty::D3D12_BLEND_SRC1_COLOR,
-        //Bf::OneMinusSrc1Color if is_alpha => d3d12_ty::D3D12_BLEND_INV_SRC1_ALPHA,
-        //Bf::OneMinusSrc1Color => d3d12_ty::D3D12_BLEND_INV_SRC1_COLOR,
-        //Bf::Src1Alpha => d3d12_ty::D3D12_BLEND_SRC1_ALPHA,
-        //Bf::OneMinusSrc1Alpha => d3d12_ty::D3D12_BLEND_INV_SRC1_ALPHA,
+        Bf::Src1 if is_alpha => d3d12_ty::D3D12_BLEND_SRC1_ALPHA,
+        Bf::Src1 => d3d12_ty::D3D12_BLEND_SRC1_COLOR,
+        Bf::OneMinusSrc1 if is_alpha => d3d12_ty::D3D12_BLEND_INV_SRC1_ALPHA,
+        Bf::OneMinusSrc1 => d3d12_ty::D3D12_BLEND_INV_SRC1_COLOR,
+        Bf::Src1Alpha => d3d12_ty::D3D12_BLEND_SRC1_ALPHA,
+        Bf::OneMinusSrc1Alpha => d3d12_ty::D3D12_BLEND_INV_SRC1_ALPHA,
     }
 }
 
diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs
index 348f62bc03..3dae58b7c4 100644
--- a/wgpu-hal/src/gles/adapter.rs
+++ b/wgpu-hal/src/gles/adapter.rs
@@ -363,6 +363,10 @@ impl super::Adapter {
             wgt::Features::MULTIVIEW,
             extensions.contains("OVR_multiview2"),
         );
+        features.set(
+            wgt::Features::DUAL_SOURCE_BLENDING,
+            extensions.contains("GL_EXT_blend_func_extended"),
+        );
         features.set(
             wgt::Features::SHADER_PRIMITIVE_INDEX,
             ver >= (3, 2) || extensions.contains("OES_geometry_shader"),
diff --git a/wgpu-hal/src/gles/conv.rs b/wgpu-hal/src/gles/conv.rs
index dd5d764c6a..9bfac022a1 100644
--- a/wgpu-hal/src/gles/conv.rs
+++ b/wgpu-hal/src/gles/conv.rs
@@ -376,6 +376,10 @@ fn map_blend_factor(factor: wgt::BlendFactor) -> u32 {
         Bf::Constant => glow::CONSTANT_COLOR,
         Bf::OneMinusConstant => glow::ONE_MINUS_CONSTANT_COLOR,
         Bf::SrcAlphaSaturated => glow::SRC_ALPHA_SATURATE,
+        Bf::Src1 => glow::SRC1_COLOR,
+        Bf::OneMinusSrc1 => glow::ONE_MINUS_SRC1_COLOR,
+        Bf::Src1Alpha => glow::SRC1_ALPHA,
+        Bf::OneMinusSrc1Alpha => glow::ONE_MINUS_SRC1_ALPHA,
     }
 }
 
diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs
index 126741d257..da254442bc 100644
--- a/wgpu-hal/src/metal/adapter.rs
+++ b/wgpu-hal/src/metal/adapter.rs
@@ -833,6 +833,10 @@ impl super::PrivateCapabilities {
             self.timestamp_query_support
                 .contains(TimestampQuerySupport::INSIDE_WGPU_PASSES),
         );
+        features.set(
+            F::DUAL_SOURCE_BLENDING,
+            self.msl_version >= MTLLanguageVersion::V1_2 && self.dual_source_blending,
+        );
         features.set(F::TEXTURE_COMPRESSION_ASTC, self.format_astc);
         features.set(F::TEXTURE_COMPRESSION_ASTC_HDR, self.format_astc_hdr);
         features.set(F::TEXTURE_COMPRESSION_BC, self.format_bc);
diff --git a/wgpu-hal/src/metal/conv.rs b/wgpu-hal/src/metal/conv.rs
index a1ceb287ab..8f6439b50b 100644
--- a/wgpu-hal/src/metal/conv.rs
+++ b/wgpu-hal/src/metal/conv.rs
@@ -152,13 +152,11 @@ pub fn map_blend_factor(factor: wgt::BlendFactor) -> metal::MTLBlendFactor {
         Bf::OneMinusDstAlpha => OneMinusDestinationAlpha,
         Bf::Constant => BlendColor,
         Bf::OneMinusConstant => OneMinusBlendColor,
-        //Bf::ConstantAlpha => BlendAlpha,
-        //Bf::OneMinusConstantAlpha => OneMinusBlendAlpha,
         Bf::SrcAlphaSaturated => SourceAlphaSaturated,
-        //Bf::Src1 => Source1Color,
-        //Bf::OneMinusSrc1 => OneMinusSource1Color,
-        //Bf::Src1Alpha => Source1Alpha,
-        //Bf::OneMinusSrc1Alpha => OneMinusSource1Alpha,
+        Bf::Src1 => Source1Color,
+        Bf::OneMinusSrc1 => OneMinusSource1Color,
+        Bf::Src1Alpha => Source1Alpha,
+        Bf::OneMinusSrc1Alpha => OneMinusSource1Alpha,
     }
 }
 
diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
index bcbab85084..78aceeeeef 100644
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@@ -177,6 +177,7 @@ impl PhysicalDeviceFeatures {
                 //.shader_resource_residency(requested_features.contains(wgt::Features::SHADER_RESOURCE_RESIDENCY))
                 .geometry_shader(requested_features.contains(wgt::Features::SHADER_PRIMITIVE_INDEX))
                 .depth_clamp(requested_features.contains(wgt::Features::DEPTH_CLIP_CONTROL))
+                .dual_src_blend(requested_features.contains(wgt::Features::DUAL_SOURCE_BLENDING))
                 .build(),
             descriptor_indexing: if requested_features.intersects(indexing_features()) {
                 Some(
@@ -460,6 +461,7 @@ impl PhysicalDeviceFeatures {
         }
 
         features.set(F::DEPTH_CLIP_CONTROL, self.core.depth_clamp != 0);
+        features.set(F::DUAL_SOURCE_BLENDING, self.core.dual_src_blend != 0);
 
         if let Some(ref multiview) = self.multiview {
             features.set(F::MULTIVIEW, multiview.multiview != 0);
diff --git a/wgpu-hal/src/vulkan/conv.rs b/wgpu-hal/src/vulkan/conv.rs
index e2398c2689..459b7f858f 100644
--- a/wgpu-hal/src/vulkan/conv.rs
+++ b/wgpu-hal/src/vulkan/conv.rs
@@ -792,6 +792,10 @@ fn map_blend_factor(factor: wgt::BlendFactor) -> vk::BlendFactor {
         Bf::SrcAlphaSaturated => vk::BlendFactor::SRC_ALPHA_SATURATE,
         Bf::Constant => vk::BlendFactor::CONSTANT_COLOR,
         Bf::OneMinusConstant => vk::BlendFactor::ONE_MINUS_CONSTANT_COLOR,
+        Bf::Src1 => vk::BlendFactor::SRC1_COLOR,
+        Bf::OneMinusSrc1 => vk::BlendFactor::ONE_MINUS_SRC1_COLOR,
+        Bf::Src1Alpha => vk::BlendFactor::SRC1_ALPHA,
+        Bf::OneMinusSrc1Alpha => vk::BlendFactor::ONE_MINUS_SRC1_ALPHA,
     }
 }
 
diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs
index 9f61e2e490..e08b802094 100644
--- a/wgpu-types/src/lib.rs
+++ b/wgpu-types/src/lib.rs
@@ -781,7 +781,17 @@ bitflags::bitflags! {
         /// This is a native only feature.
         const SHADER_EARLY_DEPTH_TEST = 1 << 62;
 
-        // 62..64 available
+        /// Allows two outputs from a shader to be used for blending.
+        /// Note that dual-source blending doesn't support multiple render targets.
+        ///
+        /// For more info see the OpenGL ES extension GL_EXT_blend_func_extended.
+        ///
+        /// Supported platforms:
+        /// - OpenGL ES (with GL_EXT_blend_func_extended)
+        /// - Metal (with MSL 1.2+)
+        /// - Vulkan (with dualSrcBlend)
+        /// - DX12
+        const DUAL_SOURCE_BLENDING = 1 << 63;
     }
 }
 
@@ -1549,6 +1559,8 @@ impl TextureViewDimension {
 ///
 /// Corresponds to [WebGPU `GPUBlendFactor`](
 /// https://gpuweb.github.io/gpuweb/#enumdef-gpublendfactor).
+/// Values using S1 requires [`Features::DUAL_SOURCE_BLENDING`] and can only be
+/// used with the first render target.
 #[repr(C)]
 #[derive(Copy, Clone, Debug, Hash, Eq, PartialEq)]
 #[cfg_attr(feature = "trace", derive(Serialize))]
@@ -1581,6 +1593,29 @@ pub enum BlendFactor {
     Constant = 11,
     /// 1.0 - Constant
     OneMinusConstant = 12,
+    /// S1.component
+    Src1 = 13,
+    /// 1.0 - S1.component
+    OneMinusSrc1 = 14,
+    /// S1.alpha
+    Src1Alpha = 15,
+    /// 1.0 - S1.alpha
+    OneMinusSrc1Alpha = 16,
+}
+
+impl BlendFactor {
+    /// Returns `true` if the blend factor references the second blend source.
+    ///
+    /// Note that the usage of those blend factors require [`Features::DUAL_SOURCE_BLENDING`].
+    pub fn ref_second_blend_source(&self) -> bool {
+        match self {
+            BlendFactor::Src1
+            | BlendFactor::OneMinusSrc1
+            | BlendFactor::Src1Alpha
+            | BlendFactor::OneMinusSrc1Alpha => true,
+            _ => false,
+        }
+    }
 }
 
 /// Alpha blend operation.
diff --git a/wgpu/src/backend/web.rs b/wgpu/src/backend/web.rs
index d457681e57..b649d41ebb 100644
--- a/wgpu/src/backend/web.rs
+++ b/wgpu/src/backend/web.rs
@@ -421,6 +421,15 @@ fn map_blend_factor(factor: wgt::BlendFactor) -> web_sys::GpuBlendFactor {
         BlendFactor::SrcAlphaSaturated => bf::SrcAlphaSaturated,
         BlendFactor::Constant => bf::Constant,
         BlendFactor::OneMinusConstant => bf::OneMinusConstant,
+        BlendFactor::Src1
+        | BlendFactor::OneMinusSrc1
+        | BlendFactor::Src1Alpha
+        | BlendFactor::OneMinusSrc1Alpha => {
+            panic!(
+                "{:?} is not enabled for this backend",
+                wgt::Features::DUAL_SOURCE_BLENDING
+            )
+        }
     }
 }
 

From 82f0cd9ee655fc14af04693e9d30c5e3130161ce Mon Sep 17 00:00:00 2001
From: Alphyr <47725341+a1phyr@users.noreply.github.com>
Date: Wed, 20 Sep 2023 20:45:09 +0200
Subject: [PATCH 10/14] Fix `Features::TEXTURE_COMPRESSION_ASTC*` doc (#4157)

Co-authored-by: Connor Fitzgerald <connorwadefitzgerald@gmail.com>
---
 CHANGELOG.md          | 3 ++-
 wgpu-types/src/lib.rs | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d60e0cd84f..4a54939a25 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -110,7 +110,8 @@ By @wumpf in [#4147](https://github.com/gfx-rs/wgpu/pull/4147)
 
 
 ### Documentation
-- Use WGSL for VertexFormat example types. By @ScanMountGoat in [#4305](https://github.com/gfx-rs/wgpu/pull/4035)
+- Use WGSL for VertexFormat example types. By @ScanMountGoat in [#4035](https://github.com/gfx-rs/wgpu/pull/4035)
+- Fix description of `Features::TEXTURE_COMPRESSION_ASTC_HDR` in [#4157](https://github.com/gfx-rs/wgpu/pull/4157)
 
 #### Metal
 
diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs
index e08b802094..59c32ec3b8 100644
--- a/wgpu-types/src/lib.rs
+++ b/wgpu-types/src/lib.rs
@@ -371,7 +371,7 @@ bitflags::bitflags! {
         /// Compressed textures sacrifice some quality in exchange for significantly reduced
         /// bandwidth usage.
         ///
-        /// Support for this feature guarantees availability of [`TextureUsages::COPY_SRC | TextureUsages::COPY_DST | TextureUsages::TEXTURE_BINDING`] for ASTC formats.
+        /// Support for this feature guarantees availability of [`TextureUsages::COPY_SRC | TextureUsages::COPY_DST | TextureUsages::TEXTURE_BINDING`] for ASTC formats with Unorm/UnormSrgb channel type.
         /// [`Features::TEXTURE_ADAPTER_SPECIFIC_FORMAT_FEATURES`] may enable additional usages.
         ///
         /// Supported Platforms:
@@ -409,7 +409,7 @@ bitflags::bitflags! {
         /// Compressed textures sacrifice some quality in exchange for significantly reduced
         /// bandwidth usage.
         ///
-        /// Support for this feature guarantees availability of [`TextureUsages::COPY_SRC | TextureUsages::COPY_DST | TextureUsages::TEXTURE_BINDING`] for BCn formats.
+        /// Support for this feature guarantees availability of [`TextureUsages::COPY_SRC | TextureUsages::COPY_DST | TextureUsages::TEXTURE_BINDING`] for ASTC formats with the HDR channel type.
         /// [`Features::TEXTURE_ADAPTER_SPECIFIC_FORMAT_FEATURES`] may enable additional usages.
         ///
         /// Supported Platforms:

From 3c37c2afe525d9ea28c3b525b1f5a927a1ce13fb Mon Sep 17 00:00:00 2001
From: Neil Sarkar <neil.r.sarkar@gmail.com>
Date: Wed, 20 Sep 2023 12:07:40 -0700
Subject: [PATCH 11/14] Bring back xtask alias (#4160)

---
 .cargo/config.toml | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 .cargo/config.toml

diff --git a/.cargo/config.toml b/.cargo/config.toml
new file mode 100644
index 0000000000..95d2a35175
--- /dev/null
+++ b/.cargo/config.toml
@@ -0,0 +1,7 @@
+[alias]
+xtask = "run --manifest-path xtask/Cargo.toml --"
+
+[build]
+rustflags = [
+"--cfg=web_sys_unstable_apis"
+]

From 2b4a8b318fb3717473b7da20b028dbc1f58ee9a2 Mon Sep 17 00:00:00 2001
From: Aaron Hill <aa1ronham@gmail.com>
Date: Wed, 20 Sep 2023 21:02:37 -0400
Subject: [PATCH 12/14] wgpu-core: Only produce StageError::InputNotConsumed on
 DX11/DX12 (#4116)

* wgpu-core: Only produce StageError::InputNotConsumed on DX11/DX12

This error only exists due to an issue with naga's HLSL support:
https://github.com/gfx-rs/naga/issues/1945
The WGPU spec itself allows vertex shader outputs that are
not consumed by the fragment shader.

Until the issue is fixed, we can allow unconsumed outputs on
all platforms other than DX11/DX12.

* Add Features::SHADER_UNUSED_VERTEX_OUTPUT to allow disabling check

* Pick an unused feature id
---
 CHANGELOG.md                     |  1 +
 deno_webgpu/lib.rs               |  7 +++++++
 wgpu-core/src/device/resource.rs |  3 ++-
 wgpu-core/src/validation.rs      | 16 ++++++++++++++--
 wgpu-hal/src/gles/adapter.rs     |  1 +
 wgpu-hal/src/metal/adapter.rs    |  1 +
 wgpu-hal/src/vulkan/adapter.rs   |  1 +
 wgpu-types/src/lib.rs            |  9 +++++++++
 8 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a54939a25..08d20c09fa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -124,6 +124,7 @@ By @wumpf in [#4147](https://github.com/gfx-rs/wgpu/pull/4147)
 - Derive storage bindings via `naga::StorageAccess` instead of `naga::GlobalUse`. By @teoxoy in [#3985](https://github.com/gfx-rs/wgpu/pull/3985).
 - `Queue::on_submitted_work_done` callbacks will now always be called after all previous `BufferSlice::map_async` callbacks, even when there are no active submissions. By @cwfitzgerald in [#4036](https://github.com/gfx-rs/wgpu/pull/4036).
 - Fix `clear` texture views being leaked when `wgpu::SurfaceTexture` is dropped before it is presented. By @rajveermalviya in [#4057](https://github.com/gfx-rs/wgpu/pull/4057).
+- Add `Feature::SHADER_UNUSED_VERTEX_OUTPUT` to allow unused vertex shader outputs. By @Aaron1011 in [#4116](https://github.com/gfx-rs/wgpu/pull/4116).
 
 #### Vulkan
 - Fix enabling `wgpu::Features::PARTIALLY_BOUND_BINDING_ARRAY` not being actually enabled in vulkan backend. By @39ali in[#3772](https://github.com/gfx-rs/wgpu/pull/3772).
diff --git a/deno_webgpu/lib.rs b/deno_webgpu/lib.rs
index 92a6a51334..bf61e42517 100644
--- a/deno_webgpu/lib.rs
+++ b/deno_webgpu/lib.rs
@@ -365,6 +365,9 @@ fn deserialize_features(features: &wgpu_types::Features) -> Vec<&'static str> {
     if features.contains(wgpu_types::Features::SHADER_EARLY_DEPTH_TEST) {
         return_features.push("shader-early-depth-test");
     }
+    if features.contains(wgpu_types::Features::SHADER_UNUSED_VERTEX_OUTPUT) {
+        return_features.push("shader-unused-vertex-output");
+    }
 
     return_features
 }
@@ -625,6 +628,10 @@ impl From<GpuRequiredFeatures> for wgpu_types::Features {
             wgpu_types::Features::SHADER_EARLY_DEPTH_TEST,
             required_features.0.contains("shader-early-depth-test"),
         );
+        features.set(
+            wgpu_types::Features::SHADER_UNUSED_VERTEX_OUTPUT,
+            required_features.0.contains("shader-unused-vertex-output"),
+        );
 
         features
     }
diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs
index ba7006d3d5..8acc34acf4 100644
--- a/wgpu-core/src/device/resource.rs
+++ b/wgpu-core/src/device/resource.rs
@@ -1290,7 +1290,8 @@ impl<A: HalApi> Device<A> {
                     inner: Box::new(inner),
                 })
             })?;
-        let interface = validation::Interface::new(&module, &info, self.limits.clone());
+        let interface =
+            validation::Interface::new(&module, &info, self.limits.clone(), self.features);
         let hal_shader = hal::ShaderInput::Naga(hal::NagaShader { module, info });
 
         let hal_desc = hal::ShaderModuleDescriptor {
diff --git a/wgpu-core/src/validation.rs b/wgpu-core/src/validation.rs
index 778cc26cd5..ef5c65ed00 100644
--- a/wgpu-core/src/validation.rs
+++ b/wgpu-core/src/validation.rs
@@ -122,6 +122,7 @@ struct EntryPoint {
 #[derive(Debug)]
 pub struct Interface {
     limits: wgt::Limits,
+    features: wgt::Features,
     resources: naga::Arena<Resource>,
     entry_points: FastHashMap<(naga::ShaderStage, String), EntryPoint>,
 }
@@ -831,7 +832,12 @@ impl Interface {
         list.push(varying);
     }
 
-    pub fn new(module: &naga::Module, info: &naga::valid::ModuleInfo, limits: wgt::Limits) -> Self {
+    pub fn new(
+        module: &naga::Module,
+        info: &naga::valid::ModuleInfo,
+        limits: wgt::Limits,
+        features: wgt::Features,
+    ) -> Self {
         let mut resources = naga::Arena::new();
         let mut resource_mapping = FastHashMap::default();
         for (var_handle, var) in module.global_variables.iter() {
@@ -912,6 +918,7 @@ impl Interface {
 
         Self {
             limits,
+            features,
             resources,
             entry_points,
         }
@@ -1121,7 +1128,12 @@ impl Interface {
         }
 
         // Check all vertex outputs and make sure the fragment shader consumes them.
-        if shader_stage == naga::ShaderStage::Fragment {
+        // This requirement is removed if the `SHADER_UNUSED_VERTEX_OUTPUT` feature is enabled.
+        if shader_stage == naga::ShaderStage::Fragment
+            && !self
+                .features
+                .contains(wgt::Features::SHADER_UNUSED_VERTEX_OUTPUT)
+        {
             for &index in inputs.keys() {
                 // This is a linear scan, but the count should be low enough
                 // that this should be fine.
diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs
index 3dae58b7c4..cbbcf7399e 100644
--- a/wgpu-hal/src/gles/adapter.rs
+++ b/wgpu-hal/src/gles/adapter.rs
@@ -372,6 +372,7 @@ impl super::Adapter {
             ver >= (3, 2) || extensions.contains("OES_geometry_shader"),
         );
         features.set(wgt::Features::SHADER_EARLY_DEPTH_TEST, ver >= (3, 1));
+        features.set(wgt::Features::SHADER_UNUSED_VERTEX_OUTPUT, true);
         let gles_bcn_exts = [
             "GL_EXT_texture_compression_s3tc_srgb",
             "GL_EXT_texture_compression_rgtc",
diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs
index da254442bc..c4617deaa0 100644
--- a/wgpu-hal/src/metal/adapter.rs
+++ b/wgpu-hal/src/metal/adapter.rs
@@ -871,6 +871,7 @@ impl super::PrivateCapabilities {
         features.set(F::ADDRESS_MODE_CLAMP_TO_ZERO, true);
 
         features.set(F::RG11B10UFLOAT_RENDERABLE, self.format_rg11b10_all);
+        features.set(F::SHADER_UNUSED_VERTEX_OUTPUT, true);
 
         features
     }
diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
index 78aceeeeef..b515628726 100644
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@@ -522,6 +522,7 @@ impl PhysicalDeviceFeatures {
                 | vk::FormatFeatureFlags::COLOR_ATTACHMENT_BLEND,
         );
         features.set(F::RG11B10UFLOAT_RENDERABLE, rg11b10ufloat_renderable);
+        features.set(F::SHADER_UNUSED_VERTEX_OUTPUT, true);
 
         (features, dl_flags)
     }
diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs
index 59c32ec3b8..13603fc03f 100644
--- a/wgpu-types/src/lib.rs
+++ b/wgpu-types/src/lib.rs
@@ -737,6 +737,15 @@ bitflags::bitflags! {
         /// This is a native only feature.
         const VERTEX_ATTRIBUTE_64BIT = 1 << 53;
 
+        /// Allows vertex shaders to have outputs which are not consumed
+        /// by the fragment shader.
+        ///
+        /// Supported platforms:
+        /// - Vulkan
+        /// - Metal
+        /// - OpenGL
+        const SHADER_UNUSED_VERTEX_OUTPUT = 1 << 54;
+
         // 54..59 available
 
         // Shader:

From 3ff04c31e6b9b5f8b0589e6dfa55aecb91d1ce8b Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 21 Sep 2023 02:36:45 +0000
Subject: [PATCH 13/14] Bump smallvec from 1.11.0 to 1.11.1 (#4161)

Bumps [smallvec](https://github.com/servo/rust-smallvec) from 1.11.0 to 1.11.1.
- [Release notes](https://github.com/servo/rust-smallvec/releases)
- [Commits](https://github.com/servo/rust-smallvec/compare/v1.11.0...v1.11.1)

---
updated-dependencies:
- dependency-name: smallvec
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 9978941db4..942cf10970 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2531,9 +2531,9 @@ dependencies = [
 
 [[package]]
 name = "smallvec"
-version = "1.11.0"
+version = "1.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
+checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a"
 
 [[package]]
 name = "smithay-client-toolkit"

From 855fefc10e14caa7c262c9a21a1db25a821f3c96 Mon Sep 17 00:00:00 2001
From: Connor Fitzgerald <connorwadefitzgerald@gmail.com>
Date: Thu, 21 Sep 2023 02:01:35 -0400
Subject: [PATCH 14/14] Update CODEOWNERS with new wgpu team (#4162)

---
 .github/CODEOWNERS | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 7fefad320c..1d6e147803 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,2 +1,4 @@
-/cts_runner/    @crowlKats
-/deno_webgpu/   @crowlKats
+* @gfx-rs/wgpu
+
+/cts_runner/    @gfx-rs/deno @gfx-rs/wgpu
+/deno_webgpu/   @gfx-rs/deno @gfx-rs/wgpu