From 1df98d9888598ca72939a459b606f36570a1fa0f Mon Sep 17 00:00:00 2001
From: Connor Fitzgerald <connorwadefitzgerald@gmail.com>
Date: Tue, 21 Nov 2023 17:11:24 -0500
Subject: [PATCH] Test And Normalize Vertex Behavior on All Backends (#4723)

Co-authored-by: teoxoy <28601907+teoxoy@users.noreply.github.com>
---
 CHANGELOG.md                                  |  11 +-
 d3d12/src/command_list.rs                     |  14 +-
 naga/src/back/glsl/features.rs                |  54 +-
 naga/src/back/glsl/keywords.rs                |   1 +
 naga/src/back/glsl/mod.rs                     | 105 ++--
 naga/src/back/hlsl/writer.rs                  |  18 +-
 naga/tests/in/push-constants.wgsl             |   3 +-
 .../glsl/push-constants.vert_main.Vertex.glsl |   7 +-
 naga/tests/out/hlsl/interface.hlsl            |  10 +-
 naga/tests/out/hlsl/push-constants.hlsl       |  10 +-
 naga/tests/out/hlsl/skybox.hlsl               |   8 +-
 tests/tests/mem_leaks.rs                      |   4 +-
 tests/tests/vertex_indices/draw.vert.wgsl     |  11 +-
 tests/tests/vertex_indices/mod.rs             | 525 +++++++++++++-----
 wgpu-core/src/device/resource.rs              |   2 +-
 wgpu-hal/src/dx11/command.rs                  |   8 +-
 wgpu-hal/src/dx12/adapter.rs                  |   7 +-
 wgpu-hal/src/dx12/command.rs                  |  58 +-
 wgpu-hal/src/dx12/device.rs                   |   4 +-
 wgpu-hal/src/dx12/mod.rs                      |   4 +-
 wgpu-hal/src/empty.rs                         |   8 +-
 wgpu-hal/src/gles/adapter.rs                  |  20 +-
 wgpu-hal/src/gles/command.rs                  |  47 +-
 wgpu-hal/src/gles/device.rs                   |  14 +
 wgpu-hal/src/gles/mod.rs                      |  68 ++-
 wgpu-hal/src/gles/queue.rs                    | 129 +++--
 wgpu-hal/src/lib.rs                           |  12 +-
 wgpu-hal/src/metal/adapter.rs                 |  36 +-
 wgpu-hal/src/metal/command.rs                 |  24 +-
 wgpu-hal/src/metal/mod.rs                     |   4 +-
 wgpu-hal/src/vulkan/adapter.rs                |  17 +-
 wgpu-hal/src/vulkan/command.rs                |  16 +-
 wgpu-types/src/lib.rs                         | 124 ++++-
 wgpu/src/lib.rs                               |  99 ++--
 wgpu/src/util/encoder.rs                      |   4 +-
 wgpu/src/util/indirect.rs                     |  81 ---
 wgpu/src/util/mod.rs                          |   4 +-
 37 files changed, 1068 insertions(+), 503 deletions(-)
 delete mode 100644 wgpu/src/util/indirect.rs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d75dabde03..7519428f38 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,9 +40,17 @@ Bottom level categories:
 
 ## Unreleased
 
+### New Features
+
+#### General
+- Added `DownlevelFlags::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW` to know if `@builtin(vertex_index)` and `@builtin(instance_index)` will respect the `first_vertex` / `first_instance` in indirect calls. If this is not present, both will always start counting from 0. Currently enabled on all backends except DX12. By @cwfitzgerald in [#4722](https://github.com/gfx-rs/wgpu/pull/4722)
+
+#### OpenGL
+- `@builtin(instance_index)` now properly reflects the range provided in the draw call instead of always counting from 0. By @cwfitzgerald in [#4722](https://github.com/gfx-rs/wgpu/pull/4722).
+
 ### Changes
 
-- Arcanization of wgpu core resources: 
+- Arcanization of wgpu core resources:
 Removed Token and LifeTime related management
 Removed RefCount and MultiRefCount in favour of using only Arc internal reference count
 Removing mut from resources and added instead internal members locks on demand or atomics operations
@@ -65,6 +73,7 @@ By @gents83 in [#3626](https://github.com/gfx-rs/wgpu/pull/3626) and tnx also to
 
 - Log vulkan validation layer messages during instance creation and destruction: By @exrook in [#4586](https://github.com/gfx-rs/wgpu/pull/4586)
 - `TextureFormat::block_size` is deprecated, use `TextureFormat::block_copy_size` instead: By @wumpf in [#4647](https://github.com/gfx-rs/wgpu/pull/4647)
+- Rename of `DispatchIndirect`, `DrawIndexedIndirect`, and `DrawIndirect` types in the `wgpu::util` module to `DispatchIndirectArgs`, `DrawIndexedIndirectArgs`, and `DrawIndirectArgs`. By @cwfitzgerald in [#4723](https://github.com/gfx-rs/wgpu/pull/4723).
 
 #### Safe `Surface` creation
 
diff --git a/d3d12/src/command_list.rs b/d3d12/src/command_list.rs
index 168d935e30..1f8c0d53c2 100644
--- a/d3d12/src/command_list.rs
+++ b/d3d12/src/command_list.rs
@@ -213,11 +213,11 @@ impl GraphicsCommandList {
         &self,
         num_vertices: VertexCount,
         num_instances: InstanceCount,
-        start_vertex: VertexCount,
-        start_instance: InstanceCount,
+        first_vertex: VertexCount,
+        first_instance: InstanceCount,
     ) {
         unsafe {
-            self.DrawInstanced(num_vertices, num_instances, start_vertex, start_instance);
+            self.DrawInstanced(num_vertices, num_instances, first_vertex, first_instance);
         }
     }
 
@@ -225,17 +225,17 @@ impl GraphicsCommandList {
         &self,
         num_indices: IndexCount,
         num_instances: InstanceCount,
-        start_index: IndexCount,
+        first_index: IndexCount,
         base_vertex: VertexOffset,
-        start_instance: InstanceCount,
+        first_instance: InstanceCount,
     ) {
         unsafe {
             self.DrawIndexedInstanced(
                 num_indices,
                 num_instances,
-                start_index,
+                first_index,
                 base_vertex,
-                start_instance,
+                first_instance,
             );
         }
     }
diff --git a/naga/src/back/glsl/features.rs b/naga/src/back/glsl/features.rs
index 49d028fb03..aaebfde9cb 100644
--- a/naga/src/back/glsl/features.rs
+++ b/naga/src/back/glsl/features.rs
@@ -1,5 +1,6 @@
 use super::{BackendResult, Error, Version, Writer};
 use crate::{
+    back::glsl::{Options, WriterFlags},
     AddressSpace, Binding, Expression, Handle, ImageClass, ImageDimension, Interpolation, Sampling,
     Scalar, ScalarKind, ShaderStage, StorageFormat, Type, TypeInner,
 };
@@ -43,6 +44,10 @@ bitflags::bitflags! {
         const IMAGE_SIZE = 1 << 20;
         /// Dual source blending
         const DUAL_SOURCE_BLENDING = 1 << 21;
+        /// Instance index
+        ///
+        /// We can always support this, either through the language or a polyfill
+        const INSTANCE_INDEX = 1 << 22;
     }
 }
 
@@ -63,6 +68,11 @@ impl FeaturesManager {
         self.0 |= features
     }
 
+    /// Checks if the list of features [`Features`] contains the specified [`Features`]
+    pub fn contains(&mut self, features: Features) -> bool {
+        self.0.contains(features)
+    }
+
     /// Checks that all required [`Features`] are available for the specified
     /// [`Version`] otherwise returns an [`Error::MissingFeatures`].
     pub fn check_availability(&self, version: Version) -> BackendResult {
@@ -129,13 +139,13 @@ impl FeaturesManager {
     /// # Notes
     /// This won't check for feature availability so it might output extensions that aren't even
     /// supported.[`check_availability`](Self::check_availability) will check feature availability
-    pub fn write(&self, version: Version, mut out: impl Write) -> BackendResult {
-        if self.0.contains(Features::COMPUTE_SHADER) && !version.is_es() {
+    pub fn write(&self, options: &Options, mut out: impl Write) -> BackendResult {
+        if self.0.contains(Features::COMPUTE_SHADER) && !options.version.is_es() {
             // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_compute_shader.txt
             writeln!(out, "#extension GL_ARB_compute_shader : require")?;
         }
 
-        if self.0.contains(Features::BUFFER_STORAGE) && !version.is_es() {
+        if self.0.contains(Features::BUFFER_STORAGE) && !options.version.is_es() {
             // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_shader_storage_buffer_object.txt
             writeln!(
                 out,
@@ -143,22 +153,22 @@ impl FeaturesManager {
             )?;
         }
 
-        if self.0.contains(Features::DOUBLE_TYPE) && version < Version::Desktop(400) {
+        if self.0.contains(Features::DOUBLE_TYPE) && options.version < Version::Desktop(400) {
             // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_gpu_shader_fp64.txt
             writeln!(out, "#extension GL_ARB_gpu_shader_fp64 : require")?;
         }
 
         if self.0.contains(Features::CUBE_TEXTURES_ARRAY) {
-            if version.is_es() {
+            if options.version.is_es() {
                 // https://www.khronos.org/registry/OpenGL/extensions/EXT/EXT_texture_cube_map_array.txt
                 writeln!(out, "#extension GL_EXT_texture_cube_map_array : require")?;
-            } else if version < Version::Desktop(400) {
+            } else if options.version < Version::Desktop(400) {
                 // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_cube_map_array.txt
                 writeln!(out, "#extension GL_ARB_texture_cube_map_array : require")?;
             }
         }
 
-        if self.0.contains(Features::MULTISAMPLED_TEXTURE_ARRAYS) && version.is_es() {
+        if self.0.contains(Features::MULTISAMPLED_TEXTURE_ARRAYS) && options.version.is_es() {
             // https://www.khronos.org/registry/OpenGL/extensions/OES/OES_texture_storage_multisample_2d_array.txt
             writeln!(
                 out,
@@ -166,49 +176,49 @@ impl FeaturesManager {
             )?;
         }
 
-        if self.0.contains(Features::ARRAY_OF_ARRAYS) && version < Version::Desktop(430) {
+        if self.0.contains(Features::ARRAY_OF_ARRAYS) && options.version < Version::Desktop(430) {
             // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_arrays_of_arrays.txt
             writeln!(out, "#extension ARB_arrays_of_arrays : require")?;
         }
 
         if self.0.contains(Features::IMAGE_LOAD_STORE) {
-            if self.0.contains(Features::FULL_IMAGE_FORMATS) && version.is_es() {
+            if self.0.contains(Features::FULL_IMAGE_FORMATS) && options.version.is_es() {
                 // https://www.khronos.org/registry/OpenGL/extensions/NV/NV_image_formats.txt
                 writeln!(out, "#extension GL_NV_image_formats : require")?;
             }
 
-            if version < Version::Desktop(420) {
+            if options.version < Version::Desktop(420) {
                 // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_shader_image_load_store.txt
                 writeln!(out, "#extension GL_ARB_shader_image_load_store : require")?;
             }
         }
 
         if self.0.contains(Features::CONSERVATIVE_DEPTH) {
-            if version.is_es() {
+            if options.version.is_es() {
                 // https://www.khronos.org/registry/OpenGL/extensions/EXT/EXT_conservative_depth.txt
                 writeln!(out, "#extension GL_EXT_conservative_depth : require")?;
             }
 
-            if version < Version::Desktop(420) {
+            if options.version < Version::Desktop(420) {
                 // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_conservative_depth.txt
                 writeln!(out, "#extension GL_ARB_conservative_depth : require")?;
             }
         }
 
         if (self.0.contains(Features::CLIP_DISTANCE) || self.0.contains(Features::CULL_DISTANCE))
-            && version.is_es()
+            && options.version.is_es()
         {
             // https://www.khronos.org/registry/OpenGL/extensions/EXT/EXT_clip_cull_distance.txt
             writeln!(out, "#extension GL_EXT_clip_cull_distance : require")?;
         }
 
-        if self.0.contains(Features::SAMPLE_VARIABLES) && version.is_es() {
+        if self.0.contains(Features::SAMPLE_VARIABLES) && options.version.is_es() {
             // https://www.khronos.org/registry/OpenGL/extensions/OES/OES_sample_variables.txt
             writeln!(out, "#extension GL_OES_sample_variables : require")?;
         }
 
         if self.0.contains(Features::MULTI_VIEW) {
-            if let Version::Embedded { is_webgl: true, .. } = version {
+            if let Version::Embedded { is_webgl: true, .. } = options.version {
                 // https://www.khronos.org/registry/OpenGL/extensions/OVR/OVR_multiview2.txt
                 writeln!(out, "#extension GL_OVR_multiview2 : require")?;
             } else {
@@ -225,15 +235,22 @@ impl FeaturesManager {
             )?;
         }
 
-        if self.0.contains(Features::TEXTURE_LEVELS) && version < Version::Desktop(430) {
+        if self.0.contains(Features::TEXTURE_LEVELS) && options.version < Version::Desktop(430) {
             // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_query_levels.txt
             writeln!(out, "#extension GL_ARB_texture_query_levels : require")?;
         }
-        if self.0.contains(Features::DUAL_SOURCE_BLENDING) && version.is_es() {
+        if self.0.contains(Features::DUAL_SOURCE_BLENDING) && options.version.is_es() {
             // https://registry.khronos.org/OpenGL/extensions/EXT/EXT_blend_func_extended.txt
             writeln!(out, "#extension GL_EXT_blend_func_extended : require")?;
         }
 
+        if self.0.contains(Features::INSTANCE_INDEX) {
+            if options.writer_flags.contains(WriterFlags::DRAW_PARAMETERS) {
+                // https://registry.khronos.org/OpenGL/extensions/ARB/ARB_shader_draw_parameters.txt
+                writeln!(out, "#extension GL_ARB_shader_draw_parameters : require")?;
+            }
+        }
+
         Ok(())
     }
 }
@@ -490,6 +507,9 @@ impl<'a, W> Writer<'a, W> {
                             crate::BuiltIn::ViewIndex => {
                                 self.features.request(Features::MULTI_VIEW)
                             }
+                            crate::BuiltIn::InstanceIndex => {
+                                self.features.request(Features::INSTANCE_INDEX)
+                            }
                             _ => {}
                         },
                         Binding::Location {
diff --git a/naga/src/back/glsl/keywords.rs b/naga/src/back/glsl/keywords.rs
index afadd6e7f1..857c935e68 100644
--- a/naga/src/back/glsl/keywords.rs
+++ b/naga/src/back/glsl/keywords.rs
@@ -480,4 +480,5 @@ pub const RESERVED_KEYWORDS: &[&str] = &[
     // Naga utilities:
     super::MODF_FUNCTION,
     super::FREXP_FUNCTION,
+    super::FIRST_INSTANCE_BINDING,
 ];
diff --git a/naga/src/back/glsl/mod.rs b/naga/src/back/glsl/mod.rs
index d66b606abc..cd3075f70a 100644
--- a/naga/src/back/glsl/mod.rs
+++ b/naga/src/back/glsl/mod.rs
@@ -76,6 +76,9 @@ const CLAMPED_LOD_SUFFIX: &str = "_clamped_lod";
 pub(crate) const MODF_FUNCTION: &str = "naga_modf";
 pub(crate) const FREXP_FUNCTION: &str = "naga_frexp";
 
+// Must match code in glsl_built_in
+pub const FIRST_INSTANCE_BINDING: &str = "naga_vs_first_instance";
+
 /// Mapping between resources and bindings.
 pub type BindingMap = std::collections::BTreeMap<crate::ResourceBinding, u8>;
 
@@ -235,17 +238,20 @@ bitflags::bitflags! {
         /// Supports GL_EXT_texture_shadow_lod on the host, which provides
         /// additional functions on shadows and arrays of shadows.
         const TEXTURE_SHADOW_LOD = 0x2;
+        /// Supports ARB_shader_draw_parameters on the host, which provides
+        /// support for `gl_BaseInstanceARB`, `gl_BaseVertexARB`, and `gl_DrawIDARB`.
+        const DRAW_PARAMETERS = 0x4;
         /// Include unused global variables, constants and functions. By default the output will exclude
         /// global variables that are not used in the specified entrypoint (including indirect use),
         /// all constant declarations, and functions that use excluded global variables.
-        const INCLUDE_UNUSED_ITEMS = 0x4;
+        const INCLUDE_UNUSED_ITEMS = 0x10;
         /// Emit `PointSize` output builtin to vertex shaders, which is
         /// required for drawing with `PointList` topology.
         ///
         /// https://registry.khronos.org/OpenGL/specs/es/3.2/GLSL_ES_Specification_3.20.html#built-in-language-variables
         /// The variable gl_PointSize is intended for a shader to write the size of the point to be rasterized. It is measured in pixels.
         /// If gl_PointSize is not written to, its value is undefined in subsequent pipe stages.
-        const FORCE_POINT_SIZE = 0x10;
+        const FORCE_POINT_SIZE = 0x20;
     }
 }
 
@@ -388,6 +394,24 @@ impl IdGenerator {
     }
 }
 
+/// Assorted options needed for generting varyings.
+#[derive(Clone, Copy)]
+struct VaryingOptions {
+    output: bool,
+    targetting_webgl: bool,
+    draw_parameters: bool,
+}
+
+impl VaryingOptions {
+    const fn from_writer_options(options: &Options, output: bool) -> Self {
+        Self {
+            output,
+            targetting_webgl: options.version.is_webgl(),
+            draw_parameters: options.writer_flags.contains(WriterFlags::DRAW_PARAMETERS),
+        }
+    }
+}
+
 /// Helper wrapper used to get a name for a varying
 ///
 /// Varying have different naming schemes depending on their binding:
@@ -398,8 +422,7 @@ impl IdGenerator {
 struct VaryingName<'a> {
     binding: &'a crate::Binding,
     stage: ShaderStage,
-    output: bool,
-    targetting_webgl: bool,
+    options: VaryingOptions,
 }
 impl fmt::Display for VaryingName<'_> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
@@ -411,7 +434,7 @@ impl fmt::Display for VaryingName<'_> {
                 write!(f, "_fs2p_location1",)
             }
             crate::Binding::Location { location, .. } => {
-                let prefix = match (self.stage, self.output) {
+                let prefix = match (self.stage, self.options.output) {
                     (ShaderStage::Compute, _) => unreachable!(),
                     // pipeline to vertex
                     (ShaderStage::Vertex, false) => "p2vs",
@@ -423,11 +446,7 @@ impl fmt::Display for VaryingName<'_> {
                 write!(f, "_{prefix}_location{location}",)
             }
             crate::Binding::BuiltIn(built_in) => {
-                write!(
-                    f,
-                    "{}",
-                    glsl_built_in(built_in, self.output, self.targetting_webgl)
-                )
+                write!(f, "{}", glsl_built_in(built_in, self.options))
             }
         }
     }
@@ -569,7 +588,11 @@ impl<'a, W: Write> Writer<'a, W> {
             keywords::RESERVED_KEYWORDS,
             &[],
             &[],
-            &["gl_"],
+            &[
+                "gl_",                     // all GL built-in variables
+                "_group",                  // all normal bindings
+                "_push_constant_binding_", // all push constant bindings
+            ],
             &mut names,
         );
 
@@ -621,7 +644,7 @@ impl<'a, W: Write> Writer<'a, W> {
         // writing the module saving some loops but some older versions (420 or less) required the
         // extensions to appear before being used, even though extensions are part of the
         // preprocessor not the processor ¯\_(ツ)_/¯
-        self.features.write(self.options.version, &mut self.out)?;
+        self.features.write(self.options, &mut self.out)?;
 
         // Write the additional extensions
         if self
@@ -652,6 +675,17 @@ impl<'a, W: Write> Writer<'a, W> {
             writeln!(self.out)?;
         }
 
+        if self.entry_point.stage == ShaderStage::Vertex
+            && !self
+                .options
+                .writer_flags
+                .contains(WriterFlags::DRAW_PARAMETERS)
+            && self.features.contains(Features::INSTANCE_INDEX)
+        {
+            writeln!(self.out, "uniform uint {FIRST_INSTANCE_BINDING};")?;
+            writeln!(self.out)?;
+        }
+
         // Enable early depth tests if needed
         if let Some(depth_test) = self.entry_point.early_depth_test {
             // If early depth test is supported for this version of GLSL
@@ -1422,7 +1456,10 @@ impl<'a, W: Write> Writer<'a, W> {
                             writeln!(
                                 self.out,
                                 "invariant {};",
-                                glsl_built_in(built_in, output, self.options.version.is_webgl())
+                                glsl_built_in(
+                                    built_in,
+                                    VaryingOptions::from_writer_options(self.options, output)
+                                )
                             )?;
                         }
                     }
@@ -1499,8 +1536,7 @@ impl<'a, W: Write> Writer<'a, W> {
                 second_blend_source,
             },
             stage: self.entry_point.stage,
-            output,
-            targetting_webgl: self.options.version.is_webgl(),
+            options: VaryingOptions::from_writer_options(self.options, output),
         };
         writeln!(self.out, " {vname};")?;
 
@@ -1661,8 +1697,7 @@ impl<'a, W: Write> Writer<'a, W> {
                             let varying_name = VaryingName {
                                 binding: member.binding.as_ref().unwrap(),
                                 stage,
-                                output: false,
-                                targetting_webgl: self.options.version.is_webgl(),
+                                options: VaryingOptions::from_writer_options(self.options, false),
                             };
                             if index != 0 {
                                 write!(self.out, ", ")?;
@@ -1675,8 +1710,7 @@ impl<'a, W: Write> Writer<'a, W> {
                         let varying_name = VaryingName {
                             binding: arg.binding.as_ref().unwrap(),
                             stage,
-                            output: false,
-                            targetting_webgl: self.options.version.is_webgl(),
+                            options: VaryingOptions::from_writer_options(self.options, false),
                         };
                         writeln!(self.out, "{varying_name};")?;
                     }
@@ -2170,8 +2204,10 @@ impl<'a, W: Write> Writer<'a, W> {
                                         let varying_name = VaryingName {
                                             binding: member.binding.as_ref().unwrap(),
                                             stage: ep.stage,
-                                            output: true,
-                                            targetting_webgl: self.options.version.is_webgl(),
+                                            options: VaryingOptions::from_writer_options(
+                                                self.options,
+                                                true,
+                                            ),
                                         };
                                         write!(self.out, "{varying_name} = ")?;
 
@@ -2195,8 +2231,10 @@ impl<'a, W: Write> Writer<'a, W> {
                                     let name = VaryingName {
                                         binding: result.binding.as_ref().unwrap(),
                                         stage: ep.stage,
-                                        output: true,
-                                        targetting_webgl: self.options.version.is_webgl(),
+                                        options: VaryingOptions::from_writer_options(
+                                            self.options,
+                                            true,
+                                        ),
                                     };
                                     write!(self.out, "{name} = ")?;
                                     self.write_expr(value, ctx)?;
@@ -4311,29 +4349,32 @@ const fn glsl_scalar(scalar: crate::Scalar) -> Result<ScalarString<'static>, Err
 }
 
 /// Helper function that returns the glsl variable name for a builtin
-const fn glsl_built_in(
-    built_in: crate::BuiltIn,
-    output: bool,
-    targetting_webgl: bool,
-) -> &'static str {
+const fn glsl_built_in(built_in: crate::BuiltIn, options: VaryingOptions) -> &'static str {
     use crate::BuiltIn as Bi;
 
     match built_in {
         Bi::Position { .. } => {
-            if output {
+            if options.output {
                 "gl_Position"
             } else {
                 "gl_FragCoord"
             }
         }
-        Bi::ViewIndex if targetting_webgl => "int(gl_ViewID_OVR)",
+        Bi::ViewIndex if options.targetting_webgl => "int(gl_ViewID_OVR)",
         Bi::ViewIndex => "gl_ViewIndex",
         // vertex
         Bi::BaseInstance => "uint(gl_BaseInstance)",
         Bi::BaseVertex => "uint(gl_BaseVertex)",
         Bi::ClipDistance => "gl_ClipDistance",
         Bi::CullDistance => "gl_CullDistance",
-        Bi::InstanceIndex => "uint(gl_InstanceID)",
+        Bi::InstanceIndex => {
+            if options.draw_parameters {
+                "(uint(gl_InstanceID) + uint(gl_BaseInstanceARB))"
+            } else {
+                // Must match FISRT_INSTANCE_BINDING
+                "(uint(gl_InstanceID) + naga_vs_first_instance)"
+            }
+        }
         Bi::PointSize => "gl_PointSize",
         Bi::VertexIndex => "uint(gl_VertexID)",
         // fragment
@@ -4343,7 +4384,7 @@ const fn glsl_built_in(
         Bi::PrimitiveIndex => "uint(gl_PrimitiveID)",
         Bi::SampleIndex => "gl_SampleID",
         Bi::SampleMask => {
-            if output {
+            if options.output {
                 "gl_SampleMask"
             } else {
                 "gl_SampleMaskIn"
diff --git a/naga/src/back/hlsl/writer.rs b/naga/src/back/hlsl/writer.rs
index 70ad516e55..24d54fc0e5 100644
--- a/naga/src/back/hlsl/writer.rs
+++ b/naga/src/back/hlsl/writer.rs
@@ -13,8 +13,8 @@ use std::{fmt, mem};
 const LOCATION_SEMANTIC: &str = "LOC";
 const SPECIAL_CBUF_TYPE: &str = "NagaConstants";
 const SPECIAL_CBUF_VAR: &str = "_NagaConstants";
-const SPECIAL_BASE_VERTEX: &str = "base_vertex";
-const SPECIAL_BASE_INSTANCE: &str = "base_instance";
+const SPECIAL_FIRST_VERTEX: &str = "first_vertex";
+const SPECIAL_FIRST_INSTANCE: &str = "first_instance";
 const SPECIAL_OTHER: &str = "other";
 
 pub(crate) const MODF_FUNCTION: &str = "naga_modf";
@@ -189,8 +189,8 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
         // Write special constants, if needed
         if let Some(ref bt) = self.options.special_constants_binding {
             writeln!(self.out, "struct {SPECIAL_CBUF_TYPE} {{")?;
-            writeln!(self.out, "{}int {};", back::INDENT, SPECIAL_BASE_VERTEX)?;
-            writeln!(self.out, "{}int {};", back::INDENT, SPECIAL_BASE_INSTANCE)?;
+            writeln!(self.out, "{}int {};", back::INDENT, SPECIAL_FIRST_VERTEX)?;
+            writeln!(self.out, "{}int {};", back::INDENT, SPECIAL_FIRST_INSTANCE)?;
             writeln!(self.out, "{}uint {};", back::INDENT, SPECIAL_OTHER)?;
             writeln!(self.out, "}};")?;
             write!(
@@ -2102,7 +2102,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
     ) -> BackendResult {
         use crate::Expression;
 
-        // Handle the special semantics for base vertex/instance
+        // Handle the special semantics of vertex_index/instance_index
         let ff_input = if self.options.special_constants_binding.is_some() {
             func_ctx.is_fixed_function_input(expr, module)
         } else {
@@ -2110,20 +2110,20 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
         };
         let closing_bracket = match ff_input {
             Some(crate::BuiltIn::VertexIndex) => {
-                write!(self.out, "({SPECIAL_CBUF_VAR}.{SPECIAL_BASE_VERTEX} + ")?;
+                write!(self.out, "({SPECIAL_CBUF_VAR}.{SPECIAL_FIRST_VERTEX} + ")?;
                 ")"
             }
             Some(crate::BuiltIn::InstanceIndex) => {
-                write!(self.out, "({SPECIAL_CBUF_VAR}.{SPECIAL_BASE_INSTANCE} + ",)?;
+                write!(self.out, "({SPECIAL_CBUF_VAR}.{SPECIAL_FIRST_INSTANCE} + ",)?;
                 ")"
             }
             Some(crate::BuiltIn::NumWorkGroups) => {
-                //Note: despite their names (`BASE_VERTEX` and `BASE_INSTANCE`),
+                // Note: despite their names (`FIRST_VERTEX` and `FIRST_INSTANCE`),
                 // in compute shaders the special constants contain the number
                 // of workgroups, which we are using here.
                 write!(
                     self.out,
-                    "uint3({SPECIAL_CBUF_VAR}.{SPECIAL_BASE_VERTEX}, {SPECIAL_CBUF_VAR}.{SPECIAL_BASE_INSTANCE}, {SPECIAL_CBUF_VAR}.{SPECIAL_OTHER})",
+                    "uint3({SPECIAL_CBUF_VAR}.{SPECIAL_FIRST_VERTEX}, {SPECIAL_CBUF_VAR}.{SPECIAL_FIRST_INSTANCE}, {SPECIAL_CBUF_VAR}.{SPECIAL_OTHER})",
                 )?;
                 return Ok(());
             }
diff --git a/naga/tests/in/push-constants.wgsl b/naga/tests/in/push-constants.wgsl
index b3dc515230..dbe81a0295 100644
--- a/naga/tests/in/push-constants.wgsl
+++ b/naga/tests/in/push-constants.wgsl
@@ -10,9 +10,10 @@ struct FragmentIn {
 @vertex
 fn vert_main(
   @location(0) pos : vec2<f32>,
+  @builtin(instance_index) ii: u32,
   @builtin(vertex_index) vi: u32,
 ) -> @builtin(position) vec4<f32> {
-    return vec4<f32>(f32(vi) * pc.multiplier * pos, 0.0, 1.0);
+    return vec4<f32>(f32(ii) * f32(vi) * pc.multiplier * pos, 0.0, 1.0);
 }
 
 @fragment
diff --git a/naga/tests/out/glsl/push-constants.vert_main.Vertex.glsl b/naga/tests/out/glsl/push-constants.vert_main.Vertex.glsl
index 4519dc4c6c..53b218a946 100644
--- a/naga/tests/out/glsl/push-constants.vert_main.Vertex.glsl
+++ b/naga/tests/out/glsl/push-constants.vert_main.Vertex.glsl
@@ -3,6 +3,8 @@
 precision highp float;
 precision highp int;
 
+uniform uint naga_vs_first_instance;
+
 struct PushConstants {
     float multiplier;
 };
@@ -15,9 +17,10 @@ layout(location = 0) in vec2 _p2vs_location0;
 
 void main() {
     vec2 pos = _p2vs_location0;
+    uint ii = (uint(gl_InstanceID) + naga_vs_first_instance);
     uint vi = uint(gl_VertexID);
-    float _e5 = _push_constant_binding_vs.multiplier;
-    gl_Position = vec4(((float(vi) * _e5) * pos), 0.0, 1.0);
+    float _e8 = _push_constant_binding_vs.multiplier;
+    gl_Position = vec4((((float(ii) * float(vi)) * _e8) * pos), 0.0, 1.0);
     return;
 }
 
diff --git a/naga/tests/out/hlsl/interface.hlsl b/naga/tests/out/hlsl/interface.hlsl
index 3784864edf..bbf330d4d6 100644
--- a/naga/tests/out/hlsl/interface.hlsl
+++ b/naga/tests/out/hlsl/interface.hlsl
@@ -1,6 +1,6 @@
 struct NagaConstants {
-    int base_vertex;
-    int base_instance;
+    int first_vertex;
+    int first_instance;
     uint other;
 };
 ConstantBuffer<NagaConstants> _NagaConstants: register(b0, space1);
@@ -48,7 +48,7 @@ VertexOutput ConstructVertexOutput(float4 arg0, float arg1) {
 
 VertexOutput_vertex vertex(uint vertex_index : SV_VertexID, uint instance_index : SV_InstanceID, uint color : LOC10)
 {
-    uint tmp = (((_NagaConstants.base_vertex + vertex_index) + (_NagaConstants.base_instance + instance_index)) + color);
+    uint tmp = (((_NagaConstants.first_vertex + vertex_index) + (_NagaConstants.first_instance + instance_index)) + color);
     const VertexOutput vertexoutput = ConstructVertexOutput((1.0).xxxx, float(tmp));
     const VertexOutput_vertex vertexoutput_1 = { vertexoutput._varying, vertexoutput.position };
     return vertexoutput_1;
@@ -81,7 +81,7 @@ void compute(uint3 global_id : SV_DispatchThreadID, uint3 local_id : SV_GroupThr
         output = (uint[1])0;
     }
     GroupMemoryBarrierWithGroupSync();
-    output[0] = ((((global_id.x + local_id.x) + local_index) + wg_id.x) + uint3(_NagaConstants.base_vertex, _NagaConstants.base_instance, _NagaConstants.other).x);
+    output[0] = ((((global_id.x + local_id.x) + local_index) + wg_id.x) + uint3(_NagaConstants.first_vertex, _NagaConstants.first_instance, _NagaConstants.other).x);
     return;
 }
 
@@ -90,5 +90,5 @@ precise float4 vertex_two_structs(Input1_ in1_, Input2_ in2_) : SV_Position
     uint index = 2u;
 
     uint _expr8 = index;
-    return float4(float((_NagaConstants.base_vertex + in1_.index)), float((_NagaConstants.base_instance + in2_.index)), float(_expr8), 0.0);
+    return float4(float((_NagaConstants.first_vertex + in1_.index)), float((_NagaConstants.first_instance + in2_.index)), float(_expr8), 0.0);
 }
diff --git a/naga/tests/out/hlsl/push-constants.hlsl b/naga/tests/out/hlsl/push-constants.hlsl
index 22b4b6acdf..187eb5b2fc 100644
--- a/naga/tests/out/hlsl/push-constants.hlsl
+++ b/naga/tests/out/hlsl/push-constants.hlsl
@@ -1,6 +1,6 @@
 struct NagaConstants {
-    int base_vertex;
-    int base_instance;
+    int first_vertex;
+    int first_instance;
     uint other;
 };
 ConstantBuffer<NagaConstants> _NagaConstants: register(b0, space1);
@@ -19,10 +19,10 @@ struct FragmentInput_main {
     float4 color : LOC0;
 };
 
-float4 vert_main(float2 pos : LOC0, uint vi : SV_VertexID) : SV_Position
+float4 vert_main(float2 pos : LOC0, uint ii : SV_InstanceID, uint vi : SV_VertexID) : SV_Position
 {
-    float _expr5 = pc.multiplier;
-    return float4(((float((_NagaConstants.base_vertex + vi)) * _expr5) * pos), 0.0, 1.0);
+    float _expr8 = pc.multiplier;
+    return float4((((float((_NagaConstants.first_instance + ii)) * float((_NagaConstants.first_vertex + vi))) * _expr8) * pos), 0.0, 1.0);
 }
 
 float4 main(FragmentInput_main fragmentinput_main) : SV_Target0
diff --git a/naga/tests/out/hlsl/skybox.hlsl b/naga/tests/out/hlsl/skybox.hlsl
index 7a6a36b87f..8dc97b1e8e 100644
--- a/naga/tests/out/hlsl/skybox.hlsl
+++ b/naga/tests/out/hlsl/skybox.hlsl
@@ -1,6 +1,6 @@
 struct NagaConstants {
-    int base_vertex;
-    int base_instance;
+    int first_vertex;
+    int first_instance;
     uint other;
 };
 ConstantBuffer<NagaConstants> _NagaConstants: register(b1);
@@ -41,8 +41,8 @@ VertexOutput_vs_main vs_main(uint vertex_index : SV_VertexID)
     int tmp1_ = (int)0;
     int tmp2_ = (int)0;
 
-    tmp1_ = (int((_NagaConstants.base_vertex + vertex_index)) / 2);
-    tmp2_ = (int((_NagaConstants.base_vertex + vertex_index)) & 1);
+    tmp1_ = (int((_NagaConstants.first_vertex + vertex_index)) / 2);
+    tmp2_ = (int((_NagaConstants.first_vertex + vertex_index)) & 1);
     int _expr9 = tmp1_;
     int _expr15 = tmp2_;
     float4 pos = float4(((float(_expr9) * 4.0) - 1.0), ((float(_expr15) * 4.0) - 1.0), 0.0, 1.0);
diff --git a/tests/tests/mem_leaks.rs b/tests/tests/mem_leaks.rs
index 42cc12b6e2..d5a57711ab 100644
--- a/tests/tests/mem_leaks.rs
+++ b/tests/tests/mem_leaks.rs
@@ -95,7 +95,7 @@ fn draw_test_with_reports(
             layout: Some(&ppl),
             vertex: wgpu::VertexState {
                 buffers: &[],
-                entry_point: "vs_main",
+                entry_point: "vs_main_builtin",
                 module: &shader,
             },
             primitive: wgpu::PrimitiveState::default(),
@@ -276,7 +276,7 @@ fn draw_test_with_reports(
     target_os = "emscripten",
     feature = "webgl"
 ))]
-#[wgpu_macros::gpu_test]
+#[wgpu_test::gpu_test]
 static SIMPLE_DRAW_CHECK_MEM_LEAKS: wgpu_test::GpuTestConfiguration =
     wgpu_test::GpuTestConfiguration::new()
         .parameters(
diff --git a/tests/tests/vertex_indices/draw.vert.wgsl b/tests/tests/vertex_indices/draw.vert.wgsl
index 6fc8e3746f..a5f89f787a 100644
--- a/tests/tests/vertex_indices/draw.vert.wgsl
+++ b/tests/tests/vertex_indices/draw.vert.wgsl
@@ -2,7 +2,16 @@
 var<storage, read_write> indices: array<u32>; // this is used as both input and output for convenience
 
 @vertex
-fn vs_main(@builtin(instance_index) instance: u32, @builtin(vertex_index) index: u32) -> @builtin(position) vec4<f32> {
+fn vs_main_builtin(@builtin(instance_index) instance: u32, @builtin(vertex_index) index: u32) -> @builtin(position) vec4<f32> {
+    return vs_inner(instance, index);
+}
+
+@vertex
+fn vs_main_buffers(@location(0) instance: u32, @location(1) index: u32) -> @builtin(position) vec4<f32> {
+    return vs_inner(instance, index);
+}
+
+fn vs_inner(instance: u32, index: u32) -> vec4<f32> {
     let idx = instance * 3u + index;
     indices[idx] = idx;
     return vec4<f32>(0.0, 0.0, 0.0, 1.0);
diff --git a/tests/tests/vertex_indices/mod.rs b/tests/tests/vertex_indices/mod.rs
index 4fe8395eaa..6dde2915c7 100644
--- a/tests/tests/vertex_indices/mod.rs
+++ b/tests/tests/vertex_indices/mod.rs
@@ -1,14 +1,237 @@
-use std::num::NonZeroU64;
+//! Tests that vertex buffers, vertex indices, and instance indices are properly handled.
+//!
+//! We need tests for these as the backends use various schemes to work around the lack
+//! of support for things like `gl_BaseInstance` in shaders.
 
-use wgpu::util::DeviceExt;
+use std::{num::NonZeroU64, ops::Range};
 
-use wgpu_test::{gpu_test, FailureCase, GpuTestConfiguration, TestParameters, TestingContext};
+use wgpu::util::{BufferInitDescriptor, DeviceExt};
+
+use wgpu_test::{gpu_test, GpuTestConfiguration, TestParameters, TestingContext};
+
+/// Generic struct representing a draw call
+struct Draw {
+    vertex: Range<u32>,
+    instance: Range<u32>,
+    /// If present, is an indexed call
+    base_vertex: Option<i32>,
+}
+
+impl Draw {
+    /// Directly execute the draw call
+    fn execute(&self, rpass: &mut wgpu::RenderPass<'_>) {
+        if let Some(base_vertex) = self.base_vertex {
+            rpass.draw_indexed(self.vertex.clone(), base_vertex, self.instance.clone());
+        } else {
+            rpass.draw(self.vertex.clone(), self.instance.clone());
+        }
+    }
+
+    /// Add the draw call to the given indirect buffer
+    fn add_to_buffer(&self, bytes: &mut Vec<u8>, features: wgpu::Features) {
+        // The behavior of non-zero first_instance in indirect draw calls in currently undefined if INDIRECT_FIRST_INSTANCE is not supported.
+        let supports_first_instance = features.contains(wgpu::Features::INDIRECT_FIRST_INSTANCE);
+        let first_instance = if supports_first_instance {
+            self.instance.start
+        } else {
+            0
+        };
+
+        if let Some(base_vertex) = self.base_vertex {
+            bytes.extend_from_slice(
+                wgpu::util::DrawIndexedIndirectArgs {
+                    index_count: self.vertex.end - self.vertex.start,
+                    instance_count: self.instance.end - self.instance.start,
+                    base_vertex,
+                    first_index: self.vertex.start,
+                    first_instance,
+                }
+                .as_bytes(),
+            )
+        } else {
+            bytes.extend_from_slice(
+                wgpu::util::DrawIndirectArgs {
+                    vertex_count: self.vertex.end - self.vertex.start,
+                    instance_count: self.instance.end - self.instance.start,
+                    first_vertex: self.vertex.start,
+                    first_instance,
+                }
+                .as_bytes(),
+            )
+        }
+    }
+
+    /// Execute the draw call from the given indirect buffer
+    fn execute_indirect<'rpass>(
+        &self,
+        rpass: &mut wgpu::RenderPass<'rpass>,
+        indirect: &'rpass wgpu::Buffer,
+        offset: &mut u64,
+    ) {
+        if self.base_vertex.is_some() {
+            rpass.draw_indexed_indirect(indirect, *offset);
+            *offset += 20;
+        } else {
+            rpass.draw_indirect(indirect, *offset);
+            *offset += 16;
+        }
+    }
+}
+
+#[derive(Debug, Copy, Clone)]
+enum TestCase {
+    /// A single draw call with 6 vertices
+    Draw,
+    /// Two draw calls of 0..3 and 3..6 verts
+    DrawNonZeroFirstVertex,
+    /// A single draw call with 6 vertices and a vertex offset of 3
+    DrawBaseVertex,
+    /// A single draw call with 3 vertices and 2 instances
+    DrawInstanced,
+    /// Two draw calls with 3 vertices and 0..1 and 1..2 instances.
+    DrawNonZeroFirstInstance,
+}
+
+impl TestCase {
+    const ARRAY: [Self; 5] = [
+        Self::Draw,
+        Self::DrawNonZeroFirstVertex,
+        Self::DrawBaseVertex,
+        Self::DrawInstanced,
+        Self::DrawNonZeroFirstInstance,
+    ];
+
+    // Get the draw calls for this test case
+    fn draws(&self) -> &'static [Draw] {
+        match self {
+            TestCase::Draw => &[Draw {
+                vertex: 0..6,
+                instance: 0..1,
+                base_vertex: None,
+            }],
+            TestCase::DrawNonZeroFirstVertex => &[
+                Draw {
+                    vertex: 0..3,
+                    instance: 0..1,
+                    base_vertex: None,
+                },
+                Draw {
+                    vertex: 3..6,
+                    instance: 0..1,
+                    base_vertex: None,
+                },
+            ],
+            TestCase::DrawBaseVertex => &[Draw {
+                vertex: 0..6,
+                instance: 0..1,
+                base_vertex: Some(3),
+            }],
+            TestCase::DrawInstanced => &[Draw {
+                vertex: 0..3,
+                instance: 0..2,
+                base_vertex: None,
+            }],
+            TestCase::DrawNonZeroFirstInstance => &[
+                Draw {
+                    vertex: 0..3,
+                    instance: 0..1,
+                    base_vertex: None,
+                },
+                Draw {
+                    vertex: 0..3,
+                    instance: 1..2,
+                    base_vertex: None,
+                },
+            ],
+        }
+    }
+}
+
+#[derive(Debug, Copy, Clone)]
+enum IdSource {
+    /// Use buffers to load the vertex and instance index
+    Buffers,
+    /// Use builtins to load the vertex and instance index
+    Builtins,
+}
+
+impl IdSource {
+    const ARRAY: [Self; 2] = [Self::Buffers, Self::Builtins];
+}
+
+#[derive(Debug, Copy, Clone)]
+enum DrawCallKind {
+    Direct,
+    Indirect,
+}
+
+impl DrawCallKind {
+    const ARRAY: [Self; 2] = [Self::Direct, Self::Indirect];
+}
+
+struct Test {
+    case: TestCase,
+    id_source: IdSource,
+    draw_call_kind: DrawCallKind,
+}
+
+impl Test {
+    /// Get the expected result from this test, taking into account
+    /// the various features and capabilities that may be missing.
+    fn expectation(&self, ctx: &TestingContext) -> &'static [u32] {
+        let is_indirect = matches!(self.draw_call_kind, DrawCallKind::Indirect);
+
+        // Both of these failure modes require indirect rendering
+
+        // If this is false, the first instance will be ignored.
+        let non_zero_first_instance_supported = ctx
+            .adapter
+            .features()
+            .contains(wgpu::Features::INDIRECT_FIRST_INSTANCE)
+            || !is_indirect;
+
+        // If this is false, it won't be ignored, but it won't show up in the shader
+        //
+        // If the IdSource is buffers, this doesn't apply
+        let first_vert_instance_supported = ctx.adapter_downlevel_capabilities.flags.contains(
+            wgpu::DownlevelFlags::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW,
+        ) || matches!(self.id_source, IdSource::Buffers)
+            || !is_indirect;
+
+        match self.case {
+            TestCase::DrawBaseVertex => {
+                if !first_vert_instance_supported {
+                    return &[0, 1, 2, 3, 4, 5];
+                }
+
+                &[0, 0, 0, 3, 4, 5, 6, 7, 8]
+            }
+            TestCase::Draw | TestCase::DrawInstanced => &[0, 1, 2, 3, 4, 5],
+            TestCase::DrawNonZeroFirstVertex => {
+                if !first_vert_instance_supported {
+                    return &[0, 1, 2, 0, 0, 0];
+                }
+
+                &[0, 1, 2, 3, 4, 5]
+            }
+            TestCase::DrawNonZeroFirstInstance => {
+                if !first_vert_instance_supported || !non_zero_first_instance_supported {
+                    return &[0, 1, 2, 0, 0, 0];
+                }
+
+                &[0, 1, 2, 3, 4, 5]
+            }
+        }
+    }
+}
+
+fn vertex_index_common(ctx: TestingContext) {
+    let identity_buffer = ctx.device.create_buffer_init(&BufferInitDescriptor {
+        label: Some("identity buffer"),
+        contents: bytemuck::cast_slice(&[0u32, 1, 2, 3, 4, 5, 6, 7, 8]),
+        usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::INDEX,
+    });
 
-fn pulling_common(
-    ctx: TestingContext,
-    expected: &[u32],
-    function: impl FnOnce(&mut wgpu::RenderPass<'_>),
-) {
     let shader = ctx
         .device
         .create_shader_module(wgpu::include_wgsl!("draw.vert.wgsl"));
@@ -29,30 +252,6 @@ fn pulling_common(
             }],
         });
 
-    let buffer_size = 4 * expected.len() as u64;
-    let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
-        label: None,
-        size: buffer_size,
-        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
-        mapped_at_creation: false,
-    });
-
-    let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
-        label: None,
-        size: buffer_size,
-        usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::STORAGE,
-        mapped_at_creation: false,
-    });
-
-    let bg = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor {
-        label: None,
-        layout: &bgl,
-        entries: &[wgpu::BindGroupEntry {
-            binding: 0,
-            resource: gpu_buffer.as_entire_binding(),
-        }],
-    });
-
     let ppl = ctx
         .device
         .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
@@ -61,30 +260,43 @@ fn pulling_common(
             push_constant_ranges: &[],
         });
 
-    let pipeline = ctx
-        .device
-        .create_render_pipeline(&wgpu::RenderPipelineDescriptor {
-            label: None,
-            layout: Some(&ppl),
-            vertex: wgpu::VertexState {
-                buffers: &[],
-                entry_point: "vs_main",
-                module: &shader,
-            },
-            primitive: wgpu::PrimitiveState::default(),
-            depth_stencil: None,
-            multisample: wgpu::MultisampleState::default(),
-            fragment: Some(wgpu::FragmentState {
-                entry_point: "fs_main",
-                module: &shader,
-                targets: &[Some(wgpu::ColorTargetState {
-                    format: wgpu::TextureFormat::Rgba8Unorm,
-                    blend: None,
-                    write_mask: wgpu::ColorWrites::ALL,
-                })],
-            }),
-            multiview: None,
-        });
+    let mut pipeline_desc = wgpu::RenderPipelineDescriptor {
+        label: None,
+        layout: Some(&ppl),
+        vertex: wgpu::VertexState {
+            buffers: &[],
+            entry_point: "vs_main_builtin",
+            module: &shader,
+        },
+        primitive: wgpu::PrimitiveState::default(),
+        depth_stencil: None,
+        multisample: wgpu::MultisampleState::default(),
+        fragment: Some(wgpu::FragmentState {
+            entry_point: "fs_main",
+            module: &shader,
+            targets: &[Some(wgpu::ColorTargetState {
+                format: wgpu::TextureFormat::Rgba8Unorm,
+                blend: None,
+                write_mask: wgpu::ColorWrites::ALL,
+            })],
+        }),
+        multiview: None,
+    };
+    let builtin_pipeline = ctx.device.create_render_pipeline(&pipeline_desc);
+    pipeline_desc.vertex.entry_point = "vs_main_buffers";
+    pipeline_desc.vertex.buffers = &[
+        wgpu::VertexBufferLayout {
+            array_stride: 4,
+            step_mode: wgpu::VertexStepMode::Instance,
+            attributes: &wgpu::vertex_attr_array![0 => Uint32],
+        },
+        wgpu::VertexBufferLayout {
+            array_stride: 4,
+            step_mode: wgpu::VertexStepMode::Vertex,
+            attributes: &wgpu::vertex_attr_array![1 => Uint32],
+        },
+    ];
+    let buffer_pipeline = ctx.device.create_render_pipeline(&pipeline_desc);
 
     let dummy = ctx
         .device
@@ -108,91 +320,146 @@ fn pulling_common(
         )
         .create_view(&wgpu::TextureViewDescriptor::default());
 
-    let mut encoder = ctx
-        .device
-        .create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
+    let mut tests = Vec::with_capacity(5 * 2 * 2);
+    for case in TestCase::ARRAY {
+        for id_source in IdSource::ARRAY {
+            for draw_call_kind in DrawCallKind::ARRAY {
+                tests.push(Test {
+                    case,
+                    id_source,
+                    draw_call_kind,
+                })
+            }
+        }
+    }
 
-    let mut rpass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
-        label: None,
-        color_attachments: &[Some(wgpu::RenderPassColorAttachment {
-            ops: wgpu::Operations::default(),
-            resolve_target: None,
-            view: &dummy,
-        })],
-        depth_stencil_attachment: None,
-        timestamp_writes: None,
-        occlusion_query_set: None,
-    });
+    let features = ctx.adapter.features();
 
-    rpass.set_pipeline(&pipeline);
-    rpass.set_bind_group(0, &bg, &[]);
-    function(&mut rpass);
+    let mut failed = false;
+    for test in tests {
+        let pipeline = match test.id_source {
+            IdSource::Buffers => &buffer_pipeline,
+            IdSource::Builtins => &builtin_pipeline,
+        };
 
-    drop(rpass);
+        let expected = test.expectation(&ctx);
 
-    encoder.copy_buffer_to_buffer(&gpu_buffer, 0, &cpu_buffer, 0, buffer_size);
+        let buffer_size = 4 * expected.len() as u64;
+        let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+            label: None,
+            size: buffer_size,
+            usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+            mapped_at_creation: false,
+        });
 
-    ctx.queue.submit(Some(encoder.finish()));
-    let slice = cpu_buffer.slice(..);
-    slice.map_async(wgpu::MapMode::Read, |_| ());
-    ctx.device.poll(wgpu::Maintain::Wait);
-    let data: Vec<u32> = bytemuck::cast_slice(&slice.get_mapped_range()).to_vec();
+        let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+            label: None,
+            size: buffer_size,
+            usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::STORAGE,
+            mapped_at_creation: false,
+        });
 
-    assert_eq!(data, expected);
-}
+        let bg = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: None,
+            layout: &bgl,
+            entries: &[wgpu::BindGroupEntry {
+                binding: 0,
+                resource: gpu_buffer.as_entire_binding(),
+            }],
+        });
 
-#[gpu_test]
-static DRAW: GpuTestConfiguration = GpuTestConfiguration::new()
-    .parameters(
-        TestParameters::default()
-            .test_features_limits()
-            .features(wgpu::Features::VERTEX_WRITABLE_STORAGE),
-    )
-    .run_sync(|ctx| {
-        pulling_common(ctx, &[0, 1, 2, 3, 4, 5], |cmb| {
-            cmb.draw(0..6, 0..1);
-        })
-    });
+        let mut encoder1 = ctx
+            .device
+            .create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
 
-#[gpu_test]
-static DRAW_VERTEX: GpuTestConfiguration = GpuTestConfiguration::new()
-    .parameters(
-        TestParameters::default()
-            .test_features_limits()
-            .features(wgpu::Features::VERTEX_WRITABLE_STORAGE),
-    )
-    .run_sync(|ctx| {
-        pulling_common(ctx, &[0, 1, 2, 3, 4, 5], |cmb| {
-            cmb.draw(0..3, 0..1);
-            cmb.draw(3..6, 0..1);
-        })
-    });
+        let indirect_buffer;
+        let mut rpass = encoder1.begin_render_pass(&wgpu::RenderPassDescriptor {
+            label: None,
+            color_attachments: &[Some(wgpu::RenderPassColorAttachment {
+                ops: wgpu::Operations::default(),
+                resolve_target: None,
+                view: &dummy,
+            })],
+            depth_stencil_attachment: None,
+            timestamp_writes: None,
+            occlusion_query_set: None,
+        });
 
-#[gpu_test]
-static DRAW_INSTANCED: GpuTestConfiguration = GpuTestConfiguration::new()
-    .parameters(
-        TestParameters::default()
-            .test_features_limits()
-            .features(wgpu::Features::VERTEX_WRITABLE_STORAGE),
-    )
-    .run_sync(|ctx| {
-        pulling_common(ctx, &[0, 1, 2, 3, 4, 5], |cmb| {
-            cmb.draw(0..3, 0..2);
-        })
-    });
+        rpass.set_vertex_buffer(0, identity_buffer.slice(..));
+        rpass.set_vertex_buffer(1, identity_buffer.slice(..));
+        rpass.set_index_buffer(identity_buffer.slice(..), wgpu::IndexFormat::Uint32);
+        rpass.set_pipeline(pipeline);
+        rpass.set_bind_group(0, &bg, &[]);
+
+        let draws = test.case.draws();
+
+        match test.draw_call_kind {
+            DrawCallKind::Direct => {
+                for draw in draws {
+                    draw.execute(&mut rpass);
+                }
+            }
+            DrawCallKind::Indirect => {
+                let mut indirect_bytes = Vec::new();
+                for draw in draws {
+                    draw.add_to_buffer(&mut indirect_bytes, features);
+                }
+                indirect_buffer = ctx.device.create_buffer_init(&BufferInitDescriptor {
+                    label: Some("indirect"),
+                    contents: &indirect_bytes,
+                    usage: wgpu::BufferUsages::INDIRECT,
+                });
+                let mut offset = 0;
+                for draw in draws {
+                    draw.execute_indirect(&mut rpass, &indirect_buffer, &mut offset);
+                }
+            }
+        }
+
+        drop(rpass);
+
+        let mut encoder2 = ctx
+            .device
+            .create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
+
+        encoder2.copy_buffer_to_buffer(&gpu_buffer, 0, &cpu_buffer, 0, buffer_size);
+
+        // See https://github.com/gfx-rs/wgpu/issues/4732 for why this is split between two submissions
+        // with a hard wait in between.
+        ctx.queue.submit([encoder1.finish()]);
+        ctx.device.poll(wgpu::Maintain::Wait);
+        ctx.queue.submit([encoder2.finish()]);
+        let slice = cpu_buffer.slice(..);
+        slice.map_async(wgpu::MapMode::Read, |_| ());
+        ctx.device.poll(wgpu::Maintain::Wait);
+        let data: Vec<u32> = bytemuck::cast_slice(&slice.get_mapped_range()).to_vec();
+
+        if data != expected {
+            eprintln!(
+                "Failed: Got: {:?} Expected: {:?} - Case {:?} getting indices from {:?} using {:?} draw calls",
+                data,
+                expected,
+                test.case,
+                test.id_source,
+                test.draw_call_kind
+            );
+            failed = true;
+        } else {
+            eprintln!(
+                "Passed: Case {:?} getting indices from {:?} using {:?} draw calls",
+                test.case, test.id_source, test.draw_call_kind
+            );
+        }
+    }
+
+    assert!(!failed);
+}
 
 #[gpu_test]
-static DRAW_INSTANCED_OFFSET: GpuTestConfiguration = GpuTestConfiguration::new()
+static VERTEX_INDICES: GpuTestConfiguration = GpuTestConfiguration::new()
     .parameters(
         TestParameters::default()
             .test_features_limits()
-            .features(wgpu::Features::VERTEX_WRITABLE_STORAGE)
-            // https://github.com/gfx-rs/wgpu/issues/4276
-            .expect_fail(FailureCase::backend(wgpu::Backends::GL)),
+            .features(wgpu::Features::VERTEX_WRITABLE_STORAGE),
     )
-    .run_sync(|ctx| {
-        pulling_common(ctx, &[0, 1, 2, 3, 4, 5], |cmb| {
-            cmb.draw(0..3, 0..1);
-            cmb.draw(0..3, 1..2);
-        })
-    });
+    .run_sync(vertex_index_common);
diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs
index 035b2ea554..8fc4a9abb3 100644
--- a/wgpu-core/src/device/resource.rs
+++ b/wgpu-core/src/device/resource.rs
@@ -2389,7 +2389,7 @@ impl<A: HalApi> Device<A> {
             .collect::<Vec<_>>();
         let hal_desc = hal::PipelineLayoutDescriptor {
             label: desc.label.to_hal(self.instance_flags),
-            flags: hal::PipelineLayoutFlags::BASE_VERTEX_INSTANCE,
+            flags: hal::PipelineLayoutFlags::FIRST_VERTEX_INSTANCE,
             bind_group_layouts: &bgl_vec,
             push_constant_ranges: desc.push_constant_ranges.as_ref(),
         };
diff --git a/wgpu-hal/src/dx11/command.rs b/wgpu-hal/src/dx11/command.rs
index 3bbdf0a7ee..e5cc92bff9 100644
--- a/wgpu-hal/src/dx11/command.rs
+++ b/wgpu-hal/src/dx11/command.rs
@@ -187,9 +187,9 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
 
     unsafe fn draw(
         &mut self,
-        start_vertex: u32,
+        first_vertex: u32,
         vertex_count: u32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
         todo!()
@@ -197,10 +197,10 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
 
     unsafe fn draw_indexed(
         &mut self,
-        start_index: u32,
+        first_index: u32,
         index_count: u32,
         base_vertex: i32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
         todo!()
diff --git a/wgpu-hal/src/dx12/adapter.rs b/wgpu-hal/src/dx12/adapter.rs
index 9d66fd2653..04295508a3 100644
--- a/wgpu-hal/src/dx12/adapter.rs
+++ b/wgpu-hal/src/dx12/adapter.rs
@@ -298,6 +298,11 @@ impl super::Adapter {
 
         let base = wgt::Limits::default();
 
+        let mut downlevel = wgt::DownlevelCapabilities::default();
+        // https://github.com/gfx-rs/wgpu/issues/2471
+        downlevel.flags -=
+            wgt::DownlevelFlags::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW;
+
         Some(crate::ExposedAdapter {
             adapter: super::Adapter {
                 raw: adapter,
@@ -392,7 +397,7 @@ impl super::Adapter {
                     )
                     .unwrap(),
                 },
-                downlevel: wgt::DownlevelCapabilities::default(),
+                downlevel,
             },
         })
     }
diff --git a/wgpu-hal/src/dx12/command.rs b/wgpu-hal/src/dx12/command.rs
index 2ea6ad4ab2..5bbd0d4ba5 100644
--- a/wgpu-hal/src/dx12/command.rs
+++ b/wgpu-hal/src/dx12/command.rs
@@ -85,7 +85,7 @@ impl super::CommandEncoder {
         self.pass.clear();
     }
 
-    unsafe fn prepare_draw(&mut self, base_vertex: i32, base_instance: u32) {
+    unsafe fn prepare_draw(&mut self, first_vertex: i32, first_instance: u32) {
         while self.pass.dirty_vertex_buffers != 0 {
             let list = self.list.as_ref().unwrap();
             let index = self.pass.dirty_vertex_buffers.trailing_zeros();
@@ -101,18 +101,18 @@ impl super::CommandEncoder {
         if let Some(root_index) = self.pass.layout.special_constants_root_index {
             let needs_update = match self.pass.root_elements[root_index as usize] {
                 super::RootElement::SpecialConstantBuffer {
-                    base_vertex: other_vertex,
-                    base_instance: other_instance,
+                    first_vertex: other_vertex,
+                    first_instance: other_instance,
                     other: _,
-                } => base_vertex != other_vertex || base_instance != other_instance,
+                } => first_vertex != other_vertex || first_instance != other_instance,
                 _ => true,
             };
             if needs_update {
                 self.pass.dirty_root_elements |= 1 << root_index;
                 self.pass.root_elements[root_index as usize] =
                     super::RootElement::SpecialConstantBuffer {
-                        base_vertex,
-                        base_instance,
+                        first_vertex,
+                        first_instance,
                         other: 0,
                     };
             }
@@ -124,18 +124,18 @@ impl super::CommandEncoder {
         if let Some(root_index) = self.pass.layout.special_constants_root_index {
             let needs_update = match self.pass.root_elements[root_index as usize] {
                 super::RootElement::SpecialConstantBuffer {
-                    base_vertex,
-                    base_instance,
+                    first_vertex,
+                    first_instance,
                     other,
-                } => [base_vertex as u32, base_instance, other] != count,
+                } => [first_vertex as u32, first_instance, other] != count,
                 _ => true,
             };
             if needs_update {
                 self.pass.dirty_root_elements |= 1 << root_index;
                 self.pass.root_elements[root_index as usize] =
                     super::RootElement::SpecialConstantBuffer {
-                        base_vertex: count[0] as i32,
-                        base_instance: count[1],
+                        first_vertex: count[0] as i32,
+                        first_instance: count[1],
                         other: count[2],
                     };
             }
@@ -168,17 +168,17 @@ impl super::CommandEncoder {
                     }
                 }
                 super::RootElement::SpecialConstantBuffer {
-                    base_vertex,
-                    base_instance,
+                    first_vertex,
+                    first_instance,
                     other,
                 } => match self.pass.kind {
                     Pk::Render => {
-                        list.set_graphics_root_constant(index, base_vertex as u32, 0);
-                        list.set_graphics_root_constant(index, base_instance, 1);
+                        list.set_graphics_root_constant(index, first_vertex as u32, 0);
+                        list.set_graphics_root_constant(index, first_instance, 1);
                     }
                     Pk::Compute => {
-                        list.set_compute_root_constant(index, base_vertex as u32, 0);
-                        list.set_compute_root_constant(index, base_instance, 1);
+                        list.set_compute_root_constant(index, first_vertex as u32, 0);
+                        list.set_compute_root_constant(index, first_instance, 1);
                         list.set_compute_root_constant(index, other, 2);
                     }
                     Pk::Transfer => (),
@@ -220,8 +220,8 @@ impl super::CommandEncoder {
         if let Some(root_index) = layout.special_constants_root_index {
             self.pass.root_elements[root_index as usize] =
                 super::RootElement::SpecialConstantBuffer {
-                    base_vertex: 0,
-                    base_instance: 0,
+                    first_vertex: 0,
+                    first_instance: 0,
                     other: 0,
                 };
         }
@@ -1031,34 +1031,34 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
 
     unsafe fn draw(
         &mut self,
-        start_vertex: u32,
+        first_vertex: u32,
         vertex_count: u32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
-        unsafe { self.prepare_draw(start_vertex as i32, start_instance) };
+        unsafe { self.prepare_draw(first_vertex as i32, first_instance) };
         self.list.as_ref().unwrap().draw(
             vertex_count,
             instance_count,
-            start_vertex,
-            start_instance,
+            first_vertex,
+            first_instance,
         );
     }
     unsafe fn draw_indexed(
         &mut self,
-        start_index: u32,
+        first_index: u32,
         index_count: u32,
         base_vertex: i32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
-        unsafe { self.prepare_draw(base_vertex, start_instance) };
+        unsafe { self.prepare_draw(base_vertex, first_instance) };
         self.list.as_ref().unwrap().draw_indexed(
             index_count,
             instance_count,
-            start_index,
+            first_index,
             base_vertex,
-            start_instance,
+            first_instance,
         );
     }
     unsafe fn draw_indirect(
diff --git a/wgpu-hal/src/dx12/device.rs b/wgpu-hal/src/dx12/device.rs
index 06e1b59a21..649fdb4f8b 100644
--- a/wgpu-hal/src/dx12/device.rs
+++ b/wgpu-hal/src/dx12/device.rs
@@ -983,7 +983,7 @@ impl crate::Device<super::Api> for super::Device {
         debug_assert_eq!(ranges.len(), total_non_dynamic_entries);
 
         let (special_constants_root_index, special_constants_binding) = if desc.flags.intersects(
-            crate::PipelineLayoutFlags::BASE_VERTEX_INSTANCE
+            crate::PipelineLayoutFlags::FIRST_VERTEX_INSTANCE
                 | crate::PipelineLayoutFlags::NUM_WORK_GROUPS,
         ) {
             let parameter_index = parameters.len();
@@ -991,7 +991,7 @@ impl crate::Device<super::Api> for super::Device {
             parameters.push(d3d12::RootParameter::constants(
                 d3d12::ShaderVisibility::All, // really needed for VS and CS only
                 native_binding(&bind_cbv),
-                3, // 0 = base vertex, 1 = base instance, 2 = other
+                3, // 0 = first_vertex, 1 = first_instance, 2 = other
             ));
             let binding = bind_cbv.clone();
             bind_cbv.register += 1;
diff --git a/wgpu-hal/src/dx12/mod.rs b/wgpu-hal/src/dx12/mod.rs
index 0de992cf91..c50b0af165 100644
--- a/wgpu-hal/src/dx12/mod.rs
+++ b/wgpu-hal/src/dx12/mod.rs
@@ -291,8 +291,8 @@ enum RootElement {
     Empty,
     Constant,
     SpecialConstantBuffer {
-        base_vertex: i32,
-        base_instance: u32,
+        first_vertex: i32,
+        first_instance: u32,
         other: u32,
     },
     /// Descriptor table.
diff --git a/wgpu-hal/src/empty.rs b/wgpu-hal/src/empty.rs
index a2a8c57c99..487d317870 100644
--- a/wgpu-hal/src/empty.rs
+++ b/wgpu-hal/src/empty.rs
@@ -353,18 +353,18 @@ impl crate::CommandEncoder<Api> for Encoder {
 
     unsafe fn draw(
         &mut self,
-        start_vertex: u32,
+        first_vertex: u32,
         vertex_count: u32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
     }
     unsafe fn draw_indexed(
         &mut self,
-        start_index: u32,
+        first_index: u32,
         index_count: u32,
         base_vertex: i32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
     }
diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs
index 34e61ce915..c6ca1b49fa 100644
--- a/wgpu-hal/src/gles/adapter.rs
+++ b/wgpu-hal/src/gles/adapter.rs
@@ -379,7 +379,8 @@ impl super::Adapter {
         let mut downlevel_flags = wgt::DownlevelFlags::empty()
             | wgt::DownlevelFlags::NON_POWER_OF_TWO_MIPMAPPED_TEXTURES
             | wgt::DownlevelFlags::CUBE_ARRAY_TEXTURES
-            | wgt::DownlevelFlags::COMPARISON_SAMPLERS;
+            | wgt::DownlevelFlags::COMPARISON_SAMPLERS
+            | wgt::DownlevelFlags::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW;
         downlevel_flags.set(wgt::DownlevelFlags::COMPUTE_SHADERS, supports_compute);
         downlevel_flags.set(
             wgt::DownlevelFlags::FRAGMENT_WRITABLE_STORAGE,
@@ -389,8 +390,6 @@ impl super::Adapter {
             wgt::DownlevelFlags::INDIRECT_EXECUTION,
             supported((3, 1), (4, 3)) || extensions.contains("GL_ARB_multi_draw_indirect"),
         );
-        //TODO: we can actually support positive `base_vertex` in the same way
-        // as we emulate the `start_instance`. But we can't deal with negatives...
         downlevel_flags.set(wgt::DownlevelFlags::BASE_VERTEX, supported((3, 2), (3, 2)));
         downlevel_flags.set(
             wgt::DownlevelFlags::INDEPENDENT_BLEND,
@@ -616,6 +615,21 @@ impl super::Adapter {
             super::PrivateCapabilities::INVALIDATE_FRAMEBUFFER,
             supported((3, 0), (4, 3)),
         );
+        if let Some(full_ver) = full_ver {
+            let supported =
+                full_ver >= (4, 2) && extensions.contains("GL_ARB_shader_draw_parameters");
+            private_caps.set(
+                super::PrivateCapabilities::FULLY_FEATURED_INSTANCING,
+                supported,
+            );
+            // Desktop 4.2 and greater specify the first instance parameter.
+            //
+            // For all other versions, the behavior is undefined.
+            //
+            // We only support indirect first instance when we also have ARB_shader_draw_parameters as
+            // that's the only way to get gl_InstanceID to work correctly.
+            features.set(wgt::Features::INDIRECT_FIRST_INSTANCE, supported);
+        }
 
         let max_texture_size = unsafe { gl.get_parameter_i32(glow::MAX_TEXTURE_SIZE) } as u32;
         let max_texture_3d_size = unsafe { gl.get_parameter_i32(glow::MAX_3D_TEXTURE_SIZE) } as u32;
diff --git a/wgpu-hal/src/gles/command.rs b/wgpu-hal/src/gles/command.rs
index abbbe8d427..133e99598e 100644
--- a/wgpu-hal/src/gles/command.rs
+++ b/wgpu-hal/src/gles/command.rs
@@ -29,6 +29,7 @@ pub(super) struct State {
     instance_vbuf_mask: usize,
     dirty_vbuf_mask: usize,
     active_first_instance: u32,
+    first_instance_location: Option<glow::UniformLocation>,
     push_constant_descs: ArrayVec<super::PushConstantDesc, { super::MAX_PUSH_CONSTANT_COMMANDS }>,
     // The current state of the push constant data block.
     current_push_constant_data: [u32; super::MAX_PUSH_CONSTANTS],
@@ -57,6 +58,7 @@ impl Default for State {
             instance_vbuf_mask: Default::default(),
             dirty_vbuf_mask: Default::default(),
             active_first_instance: Default::default(),
+            first_instance_location: Default::default(),
             push_constant_descs: Default::default(),
             current_push_constant_data: [0; super::MAX_PUSH_CONSTANTS],
             end_of_pass_timestamp: Default::default(),
@@ -193,19 +195,32 @@ impl super::CommandEncoder {
     }
 
     fn prepare_draw(&mut self, first_instance: u32) {
-        if first_instance != self.state.active_first_instance {
+        // If we support fully featured instancing, we want to bind everything as normal
+        // and let the draw call sort it out.
+        let emulated_first_instance_value = if self
+            .private_caps
+            .contains(super::PrivateCapabilities::FULLY_FEATURED_INSTANCING)
+        {
+            0
+        } else {
+            first_instance
+        };
+
+        if emulated_first_instance_value != self.state.active_first_instance {
             // rebind all per-instance buffers on first-instance change
             self.state.dirty_vbuf_mask |= self.state.instance_vbuf_mask;
-            self.state.active_first_instance = first_instance;
+            self.state.active_first_instance = emulated_first_instance_value;
         }
         if self.state.dirty_vbuf_mask != 0 {
-            self.rebind_vertex_data(first_instance);
+            self.rebind_vertex_data(emulated_first_instance_value);
         }
     }
 
+    #[allow(clippy::clone_on_copy)] // False positive when cloning glow::UniformLocation
     fn set_pipeline_inner(&mut self, inner: &super::PipelineInner) {
         self.cmd_buffer.commands.push(C::SetProgram(inner.program));
 
+        self.state.first_instance_location = inner.first_instance_location.clone();
         self.state.push_constant_descs = inner.push_constant_descs.clone();
 
         // rebind textures, if needed
@@ -1002,40 +1017,46 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
 
     unsafe fn draw(
         &mut self,
-        start_vertex: u32,
+        first_vertex: u32,
         vertex_count: u32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
-        self.prepare_draw(start_instance);
+        self.prepare_draw(first_instance);
+        #[allow(clippy::clone_on_copy)] // False positive when cloning glow::UniformLocation
         self.cmd_buffer.commands.push(C::Draw {
             topology: self.state.topology,
-            start_vertex,
+            first_vertex,
             vertex_count,
+            first_instance,
             instance_count,
+            first_instance_location: self.state.first_instance_location.clone(),
         });
     }
     unsafe fn draw_indexed(
         &mut self,
-        start_index: u32,
+        first_index: u32,
         index_count: u32,
         base_vertex: i32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
-        self.prepare_draw(start_instance);
+        self.prepare_draw(first_instance);
         let (index_size, index_type) = match self.state.index_format {
             wgt::IndexFormat::Uint16 => (2, glow::UNSIGNED_SHORT),
             wgt::IndexFormat::Uint32 => (4, glow::UNSIGNED_INT),
         };
-        let index_offset = self.state.index_offset + index_size * start_index as wgt::BufferAddress;
+        let index_offset = self.state.index_offset + index_size * first_index as wgt::BufferAddress;
+        #[allow(clippy::clone_on_copy)] // False positive when cloning glow::UniformLocation
         self.cmd_buffer.commands.push(C::DrawIndexed {
             topology: self.state.topology,
             index_type,
             index_offset,
             index_count,
             base_vertex,
+            first_instance,
             instance_count,
+            first_instance_location: self.state.first_instance_location.clone(),
         });
     }
     unsafe fn draw_indirect(
@@ -1048,10 +1069,12 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
         for draw in 0..draw_count as wgt::BufferAddress {
             let indirect_offset =
                 offset + draw * mem::size_of::<wgt::DrawIndirectArgs>() as wgt::BufferAddress;
+            #[allow(clippy::clone_on_copy)] // False positive when cloning glow::UniformLocation
             self.cmd_buffer.commands.push(C::DrawIndirect {
                 topology: self.state.topology,
                 indirect_buf: buffer.raw.unwrap(),
                 indirect_offset,
+                first_instance_location: self.state.first_instance_location.clone(),
             });
         }
     }
@@ -1069,11 +1092,13 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
         for draw in 0..draw_count as wgt::BufferAddress {
             let indirect_offset = offset
                 + draw * mem::size_of::<wgt::DrawIndexedIndirectArgs>() as wgt::BufferAddress;
+            #[allow(clippy::clone_on_copy)] // False positive when cloning glow::UniformLocation
             self.cmd_buffer.commands.push(C::DrawIndexedIndirect {
                 topology: self.state.topology,
                 index_type,
                 indirect_buf: buffer.raw.unwrap(),
                 indirect_offset,
+                first_instance_location: self.state.first_instance_location.clone(),
             });
         }
     }
diff --git a/wgpu-hal/src/gles/device.rs b/wgpu-hal/src/gles/device.rs
index 64f47788eb..62a29be092 100644
--- a/wgpu-hal/src/gles/device.rs
+++ b/wgpu-hal/src/gles/device.rs
@@ -467,9 +467,17 @@ impl super::Device {
             }
         }
 
+        let first_instance_location = if has_stages.contains(wgt::ShaderStages::VERTEX) {
+            // If this returns none (the uniform isn't active), that's fine, we just won't set it.
+            unsafe { gl.get_uniform_location(program, naga::back::glsl::FIRST_INSTANCE_BINDING) }
+        } else {
+            None
+        };
+
         Ok(Arc::new(super::PipelineInner {
             program,
             sampler_map,
+            first_instance_location,
             push_constant_descs: uniforms,
         }))
     }
@@ -1082,6 +1090,12 @@ impl crate::Device<super::Api> for super::Device {
                 .private_caps
                 .contains(super::PrivateCapabilities::SHADER_TEXTURE_SHADOW_LOD),
         );
+        writer_flags.set(
+            glsl::WriterFlags::DRAW_PARAMETERS,
+            self.shared
+                .private_caps
+                .contains(super::PrivateCapabilities::FULLY_FEATURED_INSTANCING),
+        );
         // We always force point size to be written and it will be ignored by the driver if it's not a point list primitive.
         // https://github.com/gfx-rs/wgpu/pull/3440/files#r1095726950
         writer_flags.set(glsl::WriterFlags::FORCE_POINT_SIZE, true);
diff --git a/wgpu-hal/src/gles/mod.rs b/wgpu-hal/src/gles/mod.rs
index 7b9a694b7d..33b8470821 100644
--- a/wgpu-hal/src/gles/mod.rs
+++ b/wgpu-hal/src/gles/mod.rs
@@ -34,7 +34,7 @@ will be minimal, if any.
 
 Generally, vertex buffers are marked as dirty and lazily bound on draw.
 
-GLES3 doesn't support "base instance" semantics. However, it's easy to support,
+GLES3 doesn't support `first_instance` semantics. However, it's easy to support,
 since we are forced to do late binding anyway. We just adjust the offsets
 into the vertex data.
 
@@ -51,9 +51,34 @@ from the vertex data itself. This mostly matches WebGPU, however there is a catc
 `stride` needs to be specified with the data, not as a part of the layout.
 
 To address this, we invalidate the vertex buffers based on:
-  - whether or not `start_instance` is used
+  - whether or not `first_instance` is used
   - stride has changed
 
+## Handling of `base_vertex`, `first_instance`, and `first_vertex`
+
+Between indirect, the lack of `first_instance` semantics, and the availability of `gl_BaseInstance`
+in shaders, getting buffers and builtins to work correctly is a bit tricky.
+
+We never emulate `base_vertex` and gl_VertexID behaves as `@builtin(vertex_index)` does, so we
+never need to do anything about that.
+
+We always advertise support for `VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW`.
+
+### GL 4.2+ with ARB_shader_draw_parameters
+
+- `@builtin(instance_index)` translates to `gl_InstanceID + gl_BaseInstance`
+- We bind instance buffers without any offset emulation.
+- We advertise support for the `INDIRECT_FIRST_INSTANCE` feature.
+
+While we can theoretically have a card with 4.2+ support but without ARB_shader_draw_parameters,
+we don't bother with that combination.
+
+### GLES & GL 4.1
+
+- `@builtin(instance_index)` translates to `gl_InstanceID + naga_vs_first_instance`
+- We bind instance buffers with offset emulation.
+- We _do not_ advertise support for `INDIRECT_FIRST_INSTANCE` and cpu-side pretend the `first_instance` is 0 on indirect calls.
+
 */
 
 ///cbindgen:ignore
@@ -171,6 +196,10 @@ bitflags::bitflags! {
         const DEBUG_FNS = 1 << 13;
         /// Supports framebuffer invalidation.
         const INVALIDATE_FRAMEBUFFER = 1 << 14;
+        /// Indicates support for `glDrawElementsInstancedBaseVertexBaseInstance` and `ARB_shader_draw_parameters`
+        ///
+        /// When this is true, instance offset emulation via vertex buffer rebinding and a shader uniform will be disabled.
+        const FULLY_FEATURED_INSTANCING = 1 << 15;
     }
 }
 
@@ -516,6 +545,7 @@ type SamplerBindMap = [Option<u8>; MAX_TEXTURE_SLOTS];
 struct PipelineInner {
     program: glow::Program,
     sampler_map: SamplerBindMap,
+    first_instance_location: Option<glow::UniformLocation>,
     push_constant_descs: ArrayVec<PushConstantDesc, MAX_PUSH_CONSTANT_COMMANDS>,
 }
 
@@ -715,9 +745,11 @@ type InvalidatedAttachments = ArrayVec<u32, { crate::MAX_COLOR_ATTACHMENTS + 2 }
 enum Command {
     Draw {
         topology: u32,
-        start_vertex: u32,
+        first_vertex: u32,
         vertex_count: u32,
+        first_instance: u32,
         instance_count: u32,
+        first_instance_location: Option<glow::UniformLocation>,
     },
     DrawIndexed {
         topology: u32,
@@ -725,18 +757,22 @@ enum Command {
         index_count: u32,
         index_offset: wgt::BufferAddress,
         base_vertex: i32,
+        first_instance: u32,
         instance_count: u32,
+        first_instance_location: Option<glow::UniformLocation>,
     },
     DrawIndirect {
         topology: u32,
         indirect_buf: glow::Buffer,
         indirect_offset: wgt::BufferAddress,
+        first_instance_location: Option<glow::UniformLocation>,
     },
     DrawIndexedIndirect {
         topology: u32,
         index_type: u32,
         indirect_buf: glow::Buffer,
         indirect_offset: wgt::BufferAddress,
+        first_instance_location: Option<glow::UniformLocation>,
     },
     Dispatch([u32; 3]),
     DispatchIndirect {
@@ -914,6 +950,19 @@ impl fmt::Debug for CommandBuffer {
     }
 }
 
+#[cfg(all(
+    target_arch = "wasm32",
+    feature = "fragile-send-sync-non-atomic-wasm",
+    not(target_feature = "atomics")
+))]
+unsafe impl Sync for CommandBuffer {}
+#[cfg(all(
+    target_arch = "wasm32",
+    feature = "fragile-send-sync-non-atomic-wasm",
+    not(target_feature = "atomics")
+))]
+unsafe impl Send for CommandBuffer {}
+
 //TODO: we would have something like `Arc<typed_arena::Arena>`
 // here and in the command buffers. So that everything grows
 // inside the encoder and stays there until `reset_all`.
@@ -932,6 +981,19 @@ impl fmt::Debug for CommandEncoder {
     }
 }
 
+#[cfg(all(
+    target_arch = "wasm32",
+    feature = "fragile-send-sync-non-atomic-wasm",
+    not(target_feature = "atomics")
+))]
+unsafe impl Sync for CommandEncoder {}
+#[cfg(all(
+    target_arch = "wasm32",
+    feature = "fragile-send-sync-non-atomic-wasm",
+    not(target_feature = "atomics")
+))]
+unsafe impl Send for CommandEncoder {}
+
 #[cfg(not(all(target_arch = "wasm32", not(target_os = "emscripten"))))]
 fn gl_debug_message_callback(source: u32, gltype: u32, id: u32, severity: u32, message: &str) {
     let source_str = match source {
diff --git a/wgpu-hal/src/gles/queue.rs b/wgpu-hal/src/gles/queue.rs
index 04bc1725c7..23478df26c 100644
--- a/wgpu-hal/src/gles/queue.rs
+++ b/wgpu-hal/src/gles/queue.rs
@@ -160,20 +160,43 @@ impl super::Queue {
         match *command {
             C::Draw {
                 topology,
-                start_vertex,
+                first_vertex,
                 vertex_count,
                 instance_count,
+                first_instance,
+                ref first_instance_location,
             } => {
-                // Don't use `gl.draw_arrays` for `instance_count == 1`.
-                // Angle has a bug where it doesn't consider the instance divisor when `DYNAMIC_DRAW` is used in `draw_arrays`.
-                // See https://github.com/gfx-rs/wgpu/issues/3578
-                unsafe {
-                    gl.draw_arrays_instanced(
-                        topology,
-                        start_vertex as i32,
-                        vertex_count as i32,
-                        instance_count as i32,
-                    )
+                let supports_full_instancing = self
+                    .shared
+                    .private_caps
+                    .contains(PrivateCapabilities::FULLY_FEATURED_INSTANCING);
+
+                if supports_full_instancing {
+                    unsafe {
+                        gl.draw_arrays_instanced_base_instance(
+                            topology,
+                            first_vertex as i32,
+                            vertex_count as i32,
+                            instance_count as i32,
+                            first_instance,
+                        )
+                    }
+                } else {
+                    unsafe {
+                        gl.uniform_1_u32(first_instance_location.as_ref(), first_instance);
+                    }
+
+                    // Don't use `gl.draw_arrays` for `instance_count == 1`.
+                    // Angle has a bug where it doesn't consider the instance divisor when `DYNAMIC_DRAW` is used in `draw_arrays`.
+                    // See https://github.com/gfx-rs/wgpu/issues/3578
+                    unsafe {
+                        gl.draw_arrays_instanced(
+                            topology,
+                            first_vertex as i32,
+                            vertex_count as i32,
+                            instance_count as i32,
+                        )
+                    }
                 };
             }
             C::DrawIndexed {
@@ -182,38 +205,75 @@ impl super::Queue {
                 index_count,
                 index_offset,
                 base_vertex,
+                first_instance,
                 instance_count,
+                ref first_instance_location,
             } => {
                 match base_vertex {
-                    // Don't use `gl.draw_elements`/`gl.draw_elements_base_vertex` for `instance_count == 1`.
-                    // Angle has a bug where it doesn't consider the instance divisor when `DYNAMIC_DRAW` is used in `gl.draw_elements`/`gl.draw_elements_base_vertex`.
-                    // See https://github.com/gfx-rs/wgpu/issues/3578
-                    0 => unsafe {
-                        gl.draw_elements_instanced(
-                            topology,
-                            index_count as i32,
-                            index_type,
-                            index_offset as i32,
-                            instance_count as i32,
-                        )
-                    },
-                    _ => unsafe {
-                        gl.draw_elements_instanced_base_vertex(
-                            topology,
-                            index_count as _,
-                            index_type,
-                            index_offset as i32,
-                            instance_count as i32,
-                            base_vertex,
-                        )
-                    },
+                    0 => {
+                        unsafe {
+                            gl.uniform_1_u32(first_instance_location.as_ref(), first_instance)
+                        };
+
+                        unsafe {
+                            // Don't use `gl.draw_elements`/`gl.draw_elements_base_vertex` for `instance_count == 1`.
+                            // Angle has a bug where it doesn't consider the instance divisor when `DYNAMIC_DRAW` is used in `gl.draw_elements`/`gl.draw_elements_base_vertex`.
+                            // See https://github.com/gfx-rs/wgpu/issues/3578
+                            gl.draw_elements_instanced(
+                                topology,
+                                index_count as i32,
+                                index_type,
+                                index_offset as i32,
+                                instance_count as i32,
+                            )
+                        }
+                    }
+                    _ => {
+                        let supports_full_instancing = self
+                            .shared
+                            .private_caps
+                            .contains(PrivateCapabilities::FULLY_FEATURED_INSTANCING);
+
+                        if supports_full_instancing {
+                            unsafe {
+                                gl.draw_elements_instanced_base_vertex_base_instance(
+                                    topology,
+                                    index_count as i32,
+                                    index_type,
+                                    index_offset as i32,
+                                    instance_count as i32,
+                                    base_vertex,
+                                    first_instance,
+                                )
+                            }
+                        } else {
+                            unsafe {
+                                gl.uniform_1_u32(first_instance_location.as_ref(), first_instance)
+                            };
+
+                            // If we've gotten here, wgpu-core has already validated that this function exists via the DownlevelFlags::BASE_VERTEX feature.
+                            unsafe {
+                                gl.draw_elements_instanced_base_vertex(
+                                    topology,
+                                    index_count as _,
+                                    index_type,
+                                    index_offset as i32,
+                                    instance_count as i32,
+                                    base_vertex,
+                                )
+                            }
+                        }
+                    }
                 }
             }
             C::DrawIndirect {
                 topology,
                 indirect_buf,
                 indirect_offset,
+                ref first_instance_location,
             } => {
+                unsafe { gl.uniform_1_u32(first_instance_location.as_ref(), 0) };
+
                 unsafe { gl.bind_buffer(glow::DRAW_INDIRECT_BUFFER, Some(indirect_buf)) };
                 unsafe { gl.draw_arrays_indirect_offset(topology, indirect_offset as i32) };
             }
@@ -222,7 +282,10 @@ impl super::Queue {
                 index_type,
                 indirect_buf,
                 indirect_offset,
+                ref first_instance_location,
             } => {
+                unsafe { gl.uniform_1_u32(first_instance_location.as_ref(), 0) };
+
                 unsafe { gl.bind_buffer(glow::DRAW_INDIRECT_BUFFER, Some(indirect_buf)) };
                 unsafe {
                     gl.draw_elements_indirect_offset(topology, index_type, indirect_offset as i32)
diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs
index b0b1120dbb..6c2e3d6e2d 100644
--- a/wgpu-hal/src/lib.rs
+++ b/wgpu-hal/src/lib.rs
@@ -565,17 +565,17 @@ pub trait CommandEncoder<A: Api>: WasmNotSendSync + fmt::Debug {
 
     unsafe fn draw(
         &mut self,
-        start_vertex: u32,
+        first_vertex: u32,
         vertex_count: u32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     );
     unsafe fn draw_indexed(
         &mut self,
-        start_index: u32,
+        first_index: u32,
         index_count: u32,
         base_vertex: i32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     );
     unsafe fn draw_indirect(
@@ -623,8 +623,8 @@ bitflags!(
     /// Pipeline layout creation flags.
     #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
     pub struct PipelineLayoutFlags: u32 {
-        /// Include support for base vertex/instance drawing.
-        const BASE_VERTEX_INSTANCE = 1 << 0;
+        /// Include support for `first_vertex` / `first_instance` drawing.
+        const FIRST_VERTEX_INSTANCE = 1 << 0;
         /// Include support for num work groups builtin.
         const NUM_WORK_GROUPS = 1 << 1;
     }
diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs
index fe7122f425..99efa280a1 100644
--- a/wgpu-hal/src/metal/adapter.rs
+++ b/wgpu-hal/src/metal/adapter.rs
@@ -427,17 +427,17 @@ const BGR10A2_ALL: &[MTLFeatureSet] = &[
     MTLFeatureSet::macOS_GPUFamily2_v1,
 ];
 
-const BASE_INSTANCE_SUPPORT: &[MTLFeatureSet] = &[
+/// "Indirect draw & dispatch arguments" in the Metal feature set tables
+const INDIRECT_DRAW_DISPATCH_SUPPORT: &[MTLFeatureSet] = &[
     MTLFeatureSet::iOS_GPUFamily3_v1,
     MTLFeatureSet::tvOS_GPUFamily2_v1,
     MTLFeatureSet::macOS_GPUFamily1_v1,
 ];
 
-const BASE_VERTEX_INSTANCE_SUPPORT: &[MTLFeatureSet] = &[
-    MTLFeatureSet::iOS_GPUFamily3_v1,
-    MTLFeatureSet::tvOS_GPUFamily2_v1,
-    MTLFeatureSet::macOS_GPUFamily1_v1,
-];
+/// "Base vertex/instance drawing" in the Metal feature set tables
+///
+/// in our terms, `base_vertex` and `first_instance` must be 0
+const BASE_VERTEX_FIRST_INSTANCE_SUPPORT: &[MTLFeatureSet] = INDIRECT_DRAW_DISPATCH_SUPPORT;
 
 const TEXTURE_CUBE_ARRAY_SUPPORT: &[MTLFeatureSet] = &[
     MTLFeatureSet::iOS_GPUFamily4_v1,
@@ -600,8 +600,11 @@ impl super::PrivateCapabilities {
                 MUTABLE_COMPARISON_SAMPLER_SUPPORT,
             ),
             sampler_clamp_to_border: Self::supports_any(device, SAMPLER_CLAMP_TO_BORDER_SUPPORT),
-            base_instance: Self::supports_any(device, BASE_INSTANCE_SUPPORT),
-            base_vertex_instance_drawing: Self::supports_any(device, BASE_VERTEX_INSTANCE_SUPPORT),
+            indirect_draw_dispatch: Self::supports_any(device, INDIRECT_DRAW_DISPATCH_SUPPORT),
+            base_vertex_first_instance_drawing: Self::supports_any(
+                device,
+                BASE_VERTEX_FIRST_INSTANCE_SUPPORT,
+            ),
             dual_source_blending: Self::supports_any(device, DUAL_SOURCE_BLEND_SUPPORT),
             low_power: !os_is_mac || device.is_low_power(),
             headless: os_is_mac && device.is_headless(),
@@ -815,7 +818,6 @@ impl super::PrivateCapabilities {
         use wgt::Features as F;
 
         let mut features = F::empty()
-            | F::INDIRECT_FIRST_INSTANCE
             | F::MAPPABLE_PRIMARY_BUFFERS
             | F::VERTEX_WRITABLE_STORAGE
             | F::TEXTURE_ADAPTER_SPECIFIC_FORMAT_FEATURES
@@ -825,9 +827,12 @@ impl super::PrivateCapabilities {
             | F::TEXTURE_FORMAT_16BIT_NORM
             | F::SHADER_F16
             | F::DEPTH32FLOAT_STENCIL8
-            | F::MULTI_DRAW_INDIRECT
             | F::BGRA8UNORM_STORAGE;
 
+        features.set(
+            F::INDIRECT_FIRST_INSTANCE | F::MULTI_DRAW_INDIRECT,
+            self.indirect_draw_dispatch,
+        );
         features.set(
             F::TIMESTAMP_QUERY,
             self.timestamp_query_support
@@ -891,11 +896,20 @@ impl super::PrivateCapabilities {
             wgt::DownlevelFlags::CUBE_ARRAY_TEXTURES,
             self.texture_cube_array,
         );
-        //TODO: separate the mutable comparisons from immutable ones
+        // TODO: separate the mutable comparisons from immutable ones
         downlevel.flags.set(
             wgt::DownlevelFlags::COMPARISON_SAMPLERS,
             self.mutable_comparison_samplers,
         );
+        downlevel.flags.set(
+            wgt::DownlevelFlags::INDIRECT_EXECUTION,
+            self.indirect_draw_dispatch,
+        );
+        // TODO: add another flag for `first_instance`
+        downlevel.flags.set(
+            wgt::DownlevelFlags::BASE_VERTEX,
+            self.base_vertex_first_instance_drawing,
+        );
         downlevel
             .flags
             .set(wgt::DownlevelFlags::ANISOTROPIC_FILTERING, true);
diff --git a/wgpu-hal/src/metal/command.rs b/wgpu-hal/src/metal/command.rs
index 5196e0447d..b06f46e8a9 100644
--- a/wgpu-hal/src/metal/command.rs
+++ b/wgpu-hal/src/metal/command.rs
@@ -965,31 +965,31 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
 
     unsafe fn draw(
         &mut self,
-        start_vertex: u32,
+        first_vertex: u32,
         vertex_count: u32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
         let encoder = self.state.render.as_ref().unwrap();
-        if start_instance != 0 {
+        if first_instance != 0 {
             encoder.draw_primitives_instanced_base_instance(
                 self.state.raw_primitive_type,
-                start_vertex as _,
+                first_vertex as _,
                 vertex_count as _,
                 instance_count as _,
-                start_instance as _,
+                first_instance as _,
             );
         } else if instance_count != 1 {
             encoder.draw_primitives_instanced(
                 self.state.raw_primitive_type,
-                start_vertex as _,
+                first_vertex as _,
                 vertex_count as _,
                 instance_count as _,
             );
         } else {
             encoder.draw_primitives(
                 self.state.raw_primitive_type,
-                start_vertex as _,
+                first_vertex as _,
                 vertex_count as _,
             );
         }
@@ -997,16 +997,16 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
 
     unsafe fn draw_indexed(
         &mut self,
-        start_index: u32,
+        first_index: u32,
         index_count: u32,
         base_vertex: i32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
         let encoder = self.state.render.as_ref().unwrap();
         let index = self.state.index.as_ref().unwrap();
-        let offset = index.offset + index.stride * start_index as wgt::BufferAddress;
-        if base_vertex != 0 || start_instance != 0 {
+        let offset = index.offset + index.stride * first_index as wgt::BufferAddress;
+        if base_vertex != 0 || first_instance != 0 {
             encoder.draw_indexed_primitives_instanced_base_instance(
                 self.state.raw_primitive_type,
                 index_count as _,
@@ -1015,7 +1015,7 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
                 offset,
                 instance_count as _,
                 base_vertex as _,
-                start_instance as _,
+                first_instance as _,
             );
         } else if instance_count != 1 {
             encoder.draw_indexed_primitives_instanced(
diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs
index 360c648daf..0ddf96ed4a 100644
--- a/wgpu-hal/src/metal/mod.rs
+++ b/wgpu-hal/src/metal/mod.rs
@@ -182,8 +182,8 @@ struct PrivateCapabilities {
     shared_textures: bool,
     mutable_comparison_samplers: bool,
     sampler_clamp_to_border: bool,
-    base_instance: bool,
-    base_vertex_instance_drawing: bool,
+    indirect_draw_dispatch: bool,
+    base_vertex_first_instance_drawing: bool,
     dual_source_blending: bool,
     low_power: bool,
     headless: bool,
diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
index 6fcb7b7df6..5e6c2db1c3 100644
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@@ -331,7 +331,8 @@ impl PhysicalDeviceFeatures {
             | Df::INDIRECT_EXECUTION
             | Df::VIEW_FORMATS
             | Df::UNRESTRICTED_EXTERNAL_TEXTURE_COPIES
-            | Df::NONBLOCKING_QUERY_RESOLVE;
+            | Df::NONBLOCKING_QUERY_RESOLVE
+            | Df::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW;
 
         dl_flags.set(
             Df::SURFACE_VIEW_FORMATS,
@@ -992,11 +993,15 @@ impl super::Instance {
 
         if let Some(driver) = phd_capabilities.driver {
             if driver.conformance_version.major == 0 {
-                log::warn!(
-                    "Adapter is not Vulkan compliant, hiding adapter: {}",
-                    info.name
-                );
-                return None;
+                if driver.driver_id == ash::vk::DriverId::MOLTENVK {
+                    log::debug!("Adapter is not Vulkan compliant, but is MoltenVK, continuing");
+                } else {
+                    log::warn!(
+                        "Adapter is not Vulkan compliant, hiding adapter: {}",
+                        info.name
+                    );
+                    return None;
+                }
             }
         }
         if phd_capabilities.device_api_version == vk::API_VERSION_1_0
diff --git a/wgpu-hal/src/vulkan/command.rs b/wgpu-hal/src/vulkan/command.rs
index 52226f34eb..c31da9e2c8 100644
--- a/wgpu-hal/src/vulkan/command.rs
+++ b/wgpu-hal/src/vulkan/command.rs
@@ -722,9 +722,9 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
 
     unsafe fn draw(
         &mut self,
-        start_vertex: u32,
+        first_vertex: u32,
         vertex_count: u32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
         unsafe {
@@ -732,17 +732,17 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
                 self.active,
                 vertex_count,
                 instance_count,
-                start_vertex,
-                start_instance,
+                first_vertex,
+                first_instance,
             )
         };
     }
     unsafe fn draw_indexed(
         &mut self,
-        start_index: u32,
+        first_index: u32,
         index_count: u32,
         base_vertex: i32,
-        start_instance: u32,
+        first_instance: u32,
         instance_count: u32,
     ) {
         unsafe {
@@ -750,9 +750,9 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
                 self.active,
                 index_count,
                 instance_count,
-                start_index,
+                first_index,
                 base_vertex,
-                start_instance,
+                first_instance,
             )
         };
     }
diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs
index 56a42b4ba7..c9582fb93f 100644
--- a/wgpu-types/src/lib.rs
+++ b/wgpu-types/src/lib.rs
@@ -280,12 +280,21 @@ bitflags::bitflags! {
         ///
         /// This is a web and native feature.
         const TIMESTAMP_QUERY = 1 << 1;
-        /// Allows non-zero value for the "first instance" in indirect draw calls.
+        /// Allows non-zero value for the `first_instance` member in indirect draw calls.
+        ///
+        /// If this feature is not enabled, and the `first_instance` member is non-zero, the behavior may be:
+        /// - The draw call is ignored.
+        /// - The draw call is executed as if the `first_instance` is zero.
+        /// - The draw call is executed with the correct `first_instance` value.
         ///
         /// Supported Platforms:
         /// - Vulkan (mostly)
         /// - DX12
-        /// - Metal
+        /// - Metal on Apple3+ or Mac1+
+        /// - OpenGL (Desktop 4.2+ with ARB_shader_draw_parameters only)
+        ///
+        /// Not Supported:
+        /// - OpenGL ES / WebGL
         ///
         /// This is a web and native feature.
         const INDIRECT_FIRST_INSTANCE = 1 << 2;
@@ -613,7 +622,7 @@ bitflags::bitflags! {
         /// Supported platforms:
         /// - DX12
         /// - Vulkan
-        /// - Metal (Emulated on top of `draw_indirect` and `draw_indexed_indirect`)
+        /// - Metal on Apple3+ or Mac1+ (Emulated on top of `draw_indirect` and `draw_indexed_indirect`)
         ///
         /// This is a native only feature.
         ///
@@ -1385,9 +1394,18 @@ bitflags::bitflags! {
         const FRAGMENT_WRITABLE_STORAGE = 1 << 1;
         /// Supports indirect drawing and dispatching.
         ///
-        /// DX11 on FL10 level hardware, WebGL2, and GLES 3.0 devices do not support indirect.
+        /// DX11 on FL10 level hardware, WebGL2, GLES 3.0, and Metal on Apple1/Apple2 GPUs do not support indirect.
         const INDIRECT_EXECUTION = 1 << 2;
-        /// Supports non-zero `base_vertex` parameter to indexed draw calls.
+        /// Supports non-zero `base_vertex` parameter to direct indexed draw calls.
+        ///
+        /// Indirect calls, if supported, always support non-zero `base_vertex`.
+        ///
+        /// Supported by:
+        /// - Vulkan
+        /// - DX12
+        /// - Metal on Apple3+ or Mac1+
+        /// - OpenGL 3.2+
+        /// - OpenGL ES 3.2
         const BASE_VERTEX = 1 << 3;
         /// Supports reading from a depth/stencil texture while using it as a read-only
         /// depth/stencil attachment.
@@ -1486,6 +1504,34 @@ bitflags::bitflags! {
         /// Not Supported by:
         /// - GL ES / WebGL
         const NONBLOCKING_QUERY_RESOLVE = 1 << 22;
+
+        /// If this is true, use of `@builtin(vertex_index)` and `@builtin(instance_index)` will properly take into consideration
+        /// the `first_vertex` and `first_instance` parameters of indirect draw calls.
+        ///
+        /// If this is false, `@builtin(vertex_index)` and `@builtin(instance_index)` will start by counting from 0, ignoring the
+        /// `first_vertex` and `first_instance` parameters.
+        ///
+        /// For example, if you had a draw call like this:
+        /// - `first_vertex: 4,`
+        /// - `vertex_count: 12,`
+        ///
+        /// When this flag is present, `@builtin(vertex_index)` will start at 4 and go up to 15 (12 invocations).
+        ///
+        /// When this flag is not present, `@builtin(vertex_index)` will start at 0 and go up to 11 (12 invocations).
+        ///
+        /// This only affects the builtins in the shaders,
+        /// vertex buffers and instance rate vertex buffers will behave like expected with this flag disabled.
+        ///
+        /// See also [`Features::`]
+        ///
+        /// Supported By:
+        /// - Vulkan
+        /// - Metal
+        /// - OpenGL
+        ///
+        /// Will be implemented in the future by:
+        /// - DX12 ([#2471](https://github.com/gfx-rs/wgpu/issues/2471))
+        const VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW = 1 << 23;
     }
 }
 
@@ -6507,44 +6553,84 @@ impl_bitflags!(PipelineStatisticsTypes);
 
 /// Argument buffer layout for draw_indirect commands.
 #[repr(C)]
-#[derive(Clone, Copy, Debug)]
+#[derive(Copy, Clone, Debug, Default)]
 pub struct DrawIndirectArgs {
     /// The number of vertices to draw.
     pub vertex_count: u32,
     /// The number of instances to draw.
     pub instance_count: u32,
-    /// Offset into the vertex buffers, in vertices, to begin drawing from.
+    /// The Index of the first vertex to draw.
     pub first_vertex: u32,
-    /// First instance to draw.
+    /// The instance ID of the first instance to draw.
+    ///
+    /// Has to be 0, unless [`Features::INDIRECT_FIRST_INSTANCE`](crate::Features::INDIRECT_FIRST_INSTANCE) is enabled.
     pub first_instance: u32,
 }
 
+impl DrawIndirectArgs {
+    /// Returns the bytes representation of the struct, ready to be written in a buffer.
+    pub fn as_bytes(&self) -> &[u8] {
+        unsafe {
+            std::mem::transmute(std::slice::from_raw_parts(
+                self as *const _ as *const u8,
+                std::mem::size_of::<Self>(),
+            ))
+        }
+    }
+}
+
 /// Argument buffer layout for draw_indexed_indirect commands.
 #[repr(C)]
-#[derive(Clone, Copy, Debug)]
+#[derive(Copy, Clone, Debug, Default)]
 pub struct DrawIndexedIndirectArgs {
     /// The number of indices to draw.
     pub index_count: u32,
     /// The number of instances to draw.
     pub instance_count: u32,
-    /// Offset into the index buffer, in indices, begin drawing from.
+    /// The first index within the index buffer.
     pub first_index: u32,
-    /// Added to each index value before indexing into the vertex buffers.
+    /// The value added to the vertex index before indexing into the vertex buffer.
     pub base_vertex: i32,
-    /// First instance to draw.
+    /// The instance ID of the first instance to draw.
+    ///
+    /// Has to be 0, unless [`Features::INDIRECT_FIRST_INSTANCE`](crate::Features::INDIRECT_FIRST_INSTANCE) is enabled.
     pub first_instance: u32,
 }
 
+impl DrawIndexedIndirectArgs {
+    /// Returns the bytes representation of the struct, ready to be written in a buffer.
+    pub fn as_bytes(&self) -> &[u8] {
+        unsafe {
+            std::mem::transmute(std::slice::from_raw_parts(
+                self as *const _ as *const u8,
+                std::mem::size_of::<Self>(),
+            ))
+        }
+    }
+}
+
 /// Argument buffer layout for dispatch_indirect commands.
 #[repr(C)]
-#[derive(Clone, Copy, Debug)]
+#[derive(Copy, Clone, Debug, Default)]
 pub struct DispatchIndirectArgs {
-    /// X dimension of the grid of workgroups to dispatch.
-    pub group_size_x: u32,
-    /// Y dimension of the grid of workgroups to dispatch.
-    pub group_size_y: u32,
-    /// Z dimension of the grid of workgroups to dispatch.
-    pub group_size_z: u32,
+    /// The number of work groups in X dimension.
+    pub x: u32,
+    /// The number of work groups in Y dimension.
+    pub y: u32,
+    /// The number of work groups in Z dimension.
+    pub z: u32,
+}
+
+impl DispatchIndirectArgs {
+    /// Returns the bytes representation of the struct, ready to be written into a buffer.
+    pub fn as_bytes(&self) -> &[u8] {
+        unsafe {
+            std::mem::transmute(std::slice::from_raw_parts(
+                self as *const _ as *const u8,
+                std::mem::size_of::<Self>(),
+            ))
+        }
+    }
 }
 
 /// Describes how shader bound checks should be performed.
diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs
index ffab396c85..73f13fb965 100644
--- a/wgpu/src/lib.rs
+++ b/wgpu/src/lib.rs
@@ -3875,6 +3875,35 @@ impl<'a> RenderPass<'a> {
         );
     }
 
+    /// Inserts debug marker.
+    pub fn insert_debug_marker(&mut self, label: &str) {
+        DynContext::render_pass_insert_debug_marker(
+            &*self.parent.context,
+            &mut self.id,
+            self.data.as_mut(),
+            label,
+        );
+    }
+
+    /// Start record commands and group it into debug marker group.
+    pub fn push_debug_group(&mut self, label: &str) {
+        DynContext::render_pass_push_debug_group(
+            &*self.parent.context,
+            &mut self.id,
+            self.data.as_mut(),
+            label,
+        );
+    }
+
+    /// Stops command recording and creates debug group.
+    pub fn pop_debug_group(&mut self) {
+        DynContext::render_pass_pop_debug_group(
+            &*self.parent.context,
+            &mut self.id,
+            self.data.as_mut(),
+        );
+    }
+
     /// Draws primitives from the active vertex buffer(s).
     ///
     /// The active vertex buffer(s) can be set with [`RenderPass::set_vertex_buffer`].
@@ -3906,35 +3935,6 @@ impl<'a> RenderPass<'a> {
         )
     }
 
-    /// Inserts debug marker.
-    pub fn insert_debug_marker(&mut self, label: &str) {
-        DynContext::render_pass_insert_debug_marker(
-            &*self.parent.context,
-            &mut self.id,
-            self.data.as_mut(),
-            label,
-        );
-    }
-
-    /// Start record commands and group it into debug marker group.
-    pub fn push_debug_group(&mut self, label: &str) {
-        DynContext::render_pass_push_debug_group(
-            &*self.parent.context,
-            &mut self.id,
-            self.data.as_mut(),
-            label,
-        );
-    }
-
-    /// Stops command recording and creates debug group.
-    pub fn pop_debug_group(&mut self) {
-        DynContext::render_pass_pop_debug_group(
-            &*self.parent.context,
-            &mut self.id,
-            self.data.as_mut(),
-        );
-    }
-
     /// Draws indexed primitives using the active index buffer and the active vertex buffers.
     ///
     /// The active index buffer can be set with [`RenderPass::set_index_buffer`]
@@ -3972,12 +3972,17 @@ impl<'a> RenderPass<'a> {
 
     /// Draws primitives from the active vertex buffer(s) based on the contents of the `indirect_buffer`.
     ///
-    /// The active vertex buffers can be set with [`RenderPass::set_vertex_buffer`].
+    /// This is like calling [`RenderPass::draw`] but the contents of the call are specified in the `indirect_buffer`.
+    /// The structure expected in `indirect_buffer` must conform to [`DrawIndirectArgs`](crate::util::DrawIndirectArgs).
     ///
-    /// The structure expected in `indirect_buffer` must conform to [`DrawIndirect`](crate::util::DrawIndirect).
+    /// Indirect drawing has some caviats depending on the features available. We are not currently able to validate
+    /// these and issue an error.
+    /// - If [`Features::INDIRECT_FIRST_INSTANCE`] is not present on the adapter,
+    ///   [`DrawIndirect::first_instance`](crate::util::DrawIndirectArgs::first_instance) will be ignored.
+    /// - If [`DownlevelFlags::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW`] is not present on the adapter,
+    ///   any use of `@builtin(vertex_index)` or `@builtin(instance_index)` in the vertex shader will have different values.
     ///
-    /// This drawing command uses the current render state, as set by preceding `set_*()` methods.
-    /// It is not affected by changes to the state that are performed after it is called.
+    /// See details on the individual flags for more information.
     pub fn draw_indirect(&mut self, indirect_buffer: &'a Buffer, indirect_offset: BufferAddress) {
         DynContext::render_pass_draw_indirect(
             &*self.parent.context,
@@ -3992,13 +3997,17 @@ impl<'a> RenderPass<'a> {
     /// Draws indexed primitives using the active index buffer and the active vertex buffers,
     /// based on the contents of the `indirect_buffer`.
     ///
-    /// The active index buffer can be set with [`RenderPass::set_index_buffer`], while the active
-    /// vertex buffers can be set with [`RenderPass::set_vertex_buffer`].
+    /// This is like calling [`RenderPass::draw_indexed`] but the contents of the call are specified in the `indirect_buffer`.
+    /// The structure expected in `indirect_buffer` must conform to [`DrawIndexedIndirectArgs`](crate::util::DrawIndexedIndirectArgs).
     ///
-    /// The structure expected in `indirect_buffer` must conform to [`DrawIndexedIndirect`](crate::util::DrawIndexedIndirect).
+    /// Indirect drawing has some caviats depending on the features available. We are not currently able to validate
+    /// these and issue an error.
+    /// - If [`Features::INDIRECT_FIRST_INSTANCE`] is not present on the adapter,
+    ///   [`DrawIndexedIndirect::first_instance`](crate::util::DrawIndexedIndirectArgs::first_instance) will be ignored.
+    /// - If [`DownlevelFlags::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW`] is not present on the adapter,
+    ///   any use of `@builtin(vertex_index)` or `@builtin(instance_index)` in the vertex shader will have different values.
     ///
-    /// This drawing command uses the current render state, as set by preceding `set_*()` methods.
-    /// It is not affected by changes to the state that are performed after it is called.
+    /// See details on the individual flags for more information.
     pub fn draw_indexed_indirect(
         &mut self,
         indirect_buffer: &'a Buffer,
@@ -4043,7 +4052,7 @@ impl<'a> RenderPass<'a> {
     ///
     /// The active vertex buffers can be set with [`RenderPass::set_vertex_buffer`].
     ///
-    /// The structure expected in `indirect_buffer` must conform to [`DrawIndirect`](crate::util::DrawIndirect).
+    /// The structure expected in `indirect_buffer` must conform to [`DrawIndirectArgs`](crate::util::DrawIndirectArgs).
     /// These draw structures are expected to be tightly packed.
     ///
     /// This drawing command uses the current render state, as set by preceding `set_*()` methods.
@@ -4071,7 +4080,7 @@ impl<'a> RenderPass<'a> {
     /// The active index buffer can be set with [`RenderPass::set_index_buffer`], while the active
     /// vertex buffers can be set with [`RenderPass::set_vertex_buffer`].
     ///
-    /// The structure expected in `indirect_buffer` must conform to [`DrawIndexedIndirect`](crate::util::DrawIndexedIndirect).
+    /// The structure expected in `indirect_buffer` must conform to [`DrawIndexedIndirectArgs`](crate::util::DrawIndexedIndirectArgs).
     /// These draw structures are expected to be tightly packed.
     ///
     /// This drawing command uses the current render state, as set by preceding `set_*()` methods.
@@ -4104,7 +4113,7 @@ impl<'a> RenderPass<'a> {
     ///
     /// The active vertex buffers can be set with [`RenderPass::set_vertex_buffer`].
     ///
-    /// The structure expected in `indirect_buffer` must conform to [`DrawIndirect`](crate::util::DrawIndirect).
+    /// The structure expected in `indirect_buffer` must conform to [`DrawIndirectArgs`](crate::util::DrawIndirectArgs).
     /// These draw structures are expected to be tightly packed.
     ///
     /// The structure expected in `count_buffer` is the following:
@@ -4150,7 +4159,7 @@ impl<'a> RenderPass<'a> {
     /// vertex buffers can be set with [`RenderPass::set_vertex_buffer`].
     ///
     ///
-    /// The structure expected in `indirect_buffer` must conform to [`DrawIndexedIndirect`](crate::util::DrawIndexedIndirect).
+    /// The structure expected in `indirect_buffer` must conform to [`DrawIndexedIndirectArgs`](crate::util::DrawIndexedIndirectArgs).
     ///
     /// These draw structures are expected to be tightly packed.
     ///
@@ -4406,7 +4415,7 @@ impl<'a> ComputePass<'a> {
 
     /// Dispatches compute work operations, based on the contents of the `indirect_buffer`.
     ///
-    /// The structure expected in `indirect_buffer` must conform to [`DispatchIndirect`](crate::util::DispatchIndirect).
+    /// The structure expected in `indirect_buffer` must conform to [`DispatchIndirectArgs`](crate::util::DispatchIndirectArgs).
     pub fn dispatch_workgroups_indirect(
         &mut self,
         indirect_buffer: &'a Buffer,
@@ -4654,7 +4663,7 @@ impl<'a> RenderBundleEncoder<'a> {
     ///
     /// The active vertex buffers can be set with [`RenderBundleEncoder::set_vertex_buffer`].
     ///
-    /// The structure expected in `indirect_buffer` must conform to [`DrawIndirect`](crate::util::DrawIndirect).
+    /// The structure expected in `indirect_buffer` must conform to [`DrawIndirectArgs`](crate::util::DrawIndirectArgs).
     pub fn draw_indirect(&mut self, indirect_buffer: &'a Buffer, indirect_offset: BufferAddress) {
         DynContext::render_bundle_encoder_draw_indirect(
             &*self.parent.context,
@@ -4672,7 +4681,7 @@ impl<'a> RenderBundleEncoder<'a> {
     /// The active index buffer can be set with [`RenderBundleEncoder::set_index_buffer`], while the active
     /// vertex buffers can be set with [`RenderBundleEncoder::set_vertex_buffer`].
     ///
-    /// The structure expected in `indirect_buffer` must conform to [`DrawIndexedIndirect`](crate::util::DrawIndexedIndirect).
+    /// The structure expected in `indirect_buffer` must conform to [`DrawIndexedIndirectArgs`](crate::util::DrawIndexedIndirectArgs).
     pub fn draw_indexed_indirect(
         &mut self,
         indirect_buffer: &'a Buffer,
diff --git a/wgpu/src/util/encoder.rs b/wgpu/src/util/encoder.rs
index 208c8c0a51..bef8fe9509 100644
--- a/wgpu/src/util/encoder.rs
+++ b/wgpu/src/util/encoder.rs
@@ -50,7 +50,7 @@ pub trait RenderEncoder<'a> {
     ///
     /// The active vertex buffers can be set with [`RenderEncoder::set_vertex_buffer`].
     ///
-    /// The structure expected in `indirect_buffer` must conform to [`DrawIndirect`](crate::util::DrawIndirect).
+    /// The structure expected in `indirect_buffer` must conform to [`DrawIndirectArgs`](crate::util::DrawIndirectArgs).
     fn draw_indirect(&mut self, indirect_buffer: &'a Buffer, indirect_offset: BufferAddress);
 
     /// Draws indexed primitives using the active index buffer and the active vertex buffers,
@@ -59,7 +59,7 @@ pub trait RenderEncoder<'a> {
     /// The active index buffer can be set with [`RenderEncoder::set_index_buffer`], while the active
     /// vertex buffers can be set with [`RenderEncoder::set_vertex_buffer`].
     ///
-    /// The structure expected in `indirect_buffer` must conform to [`DrawIndexedIndirect`](crate::util::DrawIndexedIndirect).
+    /// The structure expected in `indirect_buffer` must conform to [`DrawIndexedIndirectArgs`](crate::util::DrawIndexedIndirectArgs).
     fn draw_indexed_indirect(
         &mut self,
         indirect_buffer: &'a Buffer,
diff --git a/wgpu/src/util/indirect.rs b/wgpu/src/util/indirect.rs
deleted file mode 100644
index 36e6e70633..0000000000
--- a/wgpu/src/util/indirect.rs
+++ /dev/null
@@ -1,81 +0,0 @@
-/// The structure expected in `indirect_buffer` for [`RenderEncoder::draw_indirect`](crate::util::RenderEncoder::draw_indirect).
-#[repr(C)]
-#[derive(Copy, Clone, Debug, Default)]
-pub struct DrawIndirect {
-    /// The number of vertices to draw.
-    pub vertex_count: u32,
-    /// The number of instances to draw.
-    pub instance_count: u32,
-    /// The Index of the first vertex to draw.
-    pub base_vertex: u32,
-    /// The instance ID of the first instance to draw.
-    /// Has to be 0, unless [`Features::INDIRECT_FIRST_INSTANCE`](crate::Features::INDIRECT_FIRST_INSTANCE) is enabled.
-    pub base_instance: u32,
-}
-
-impl DrawIndirect {
-    /// Returns the bytes representation of the struct, ready to be written in a [`Buffer`](crate::Buffer).
-    pub fn as_bytes(&self) -> &[u8] {
-        unsafe {
-            std::mem::transmute(std::slice::from_raw_parts(
-                self as *const _ as *const u8,
-                std::mem::size_of::<Self>(),
-            ))
-        }
-    }
-}
-
-/// The structure expected in `indirect_buffer` for [`RenderEncoder::draw_indexed_indirect`](crate::util::RenderEncoder::draw_indexed_indirect).
-#[repr(C)]
-#[derive(Copy, Clone, Debug, Default)]
-pub struct DrawIndexedIndirect {
-    /// The number of vertices to draw.
-    pub vertex_count: u32,
-    /// The number of instances to draw.
-    pub instance_count: u32,
-    /// The base index within the index buffer.
-    pub base_index: u32,
-    /// The value added to the vertex index before indexing into the vertex buffer.
-    pub vertex_offset: i32,
-    /// The instance ID of the first instance to draw.
-    /// Has to be 0, unless [`Features::INDIRECT_FIRST_INSTANCE`](crate::Features::INDIRECT_FIRST_INSTANCE) is enabled.
-    pub base_instance: u32,
-}
-
-impl DrawIndexedIndirect {
-    /// Returns the bytes representation of the struct, ready to be written in a [`Buffer`](crate::Buffer).
-    pub fn as_bytes(&self) -> &[u8] {
-        unsafe {
-            std::mem::transmute(std::slice::from_raw_parts(
-                self as *const _ as *const u8,
-                std::mem::size_of::<Self>(),
-            ))
-        }
-    }
-}
-
-/// The structure expected in `indirect_buffer` for [`ComputePass::dispatch_workgroups_indirect`](crate::ComputePass::dispatch_workgroups_indirect).
-///
-/// x, y and z denote the number of work groups to dispatch in each dimension.
-#[repr(C)]
-#[derive(Copy, Clone, Debug, Default)]
-pub struct DispatchIndirect {
-    /// The number of work groups in X dimension.
-    pub x: u32,
-    /// The number of work groups in Y dimension.
-    pub y: u32,
-    /// The number of work groups in Z dimension.
-    pub z: u32,
-}
-
-impl DispatchIndirect {
-    /// Returns the bytes representation of the struct, ready to be written in a [`Buffer`](crate::Buffer).
-    pub fn as_bytes(&self) -> &[u8] {
-        unsafe {
-            std::mem::transmute(std::slice::from_raw_parts(
-                self as *const _ as *const u8,
-                std::mem::size_of::<Self>(),
-            ))
-        }
-    }
-}
diff --git a/wgpu/src/util/mod.rs b/wgpu/src/util/mod.rs
index 98b6155f15..8c52a8b7cc 100644
--- a/wgpu/src/util/mod.rs
+++ b/wgpu/src/util/mod.rs
@@ -6,7 +6,6 @@
 mod belt;
 mod device;
 mod encoder;
-mod indirect;
 mod init;
 
 use std::sync::Arc;
@@ -19,9 +18,8 @@ use std::{
 pub use belt::StagingBelt;
 pub use device::{BufferInitDescriptor, DeviceExt};
 pub use encoder::RenderEncoder;
-pub use indirect::*;
 pub use init::*;
-pub use wgt::math::*;
+pub use wgt::{math::*, DispatchIndirectArgs, DrawIndexedIndirectArgs, DrawIndirectArgs};
 
 /// Treat the given byte slice as a SPIR-V module.
 ///